OpenStack Compute (Nova)
# Copyright 2010 United States Government as represented by the
# Administrator of the National Aeronautics and Space Administration.
# Copyright 2011 Justin Santa Barbara
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

"""Handles all processes relating to instances (guest vms).

The :py:class:`ComputeManager` class is a :py:class:`nova.manager.Manager` that
handles RPC calls relating to creating instances. It is responsible for
building a disk image, launching it via the underlying virtualization driver,
responding to calls to check its state, attaching persistent storage, and
terminating it.
"""

import base64
import binascii
import contextlib
import copy
import functools
import inspect
import sys
import time
import traceback

from cinderclient import exceptions as cinder_exception
from cursive import exception as cursive_exception
import eventlet.event
from eventlet import greenthread
import eventlet.semaphore
import eventlet.timeout
import futurist
from keystoneauth1 import exceptions as keystone_exception
import os_traits
from oslo_log import log as logging
import oslo_messaging as messaging
from oslo_serialization import jsonutils
from oslo_service import loopingcall
from oslo_service import periodic_task
from oslo_utils import excutils
from oslo_utils import strutils
from oslo_utils import timeutils
from oslo_utils import units
import six
from six.moves import range

from nova import block_device
from nova.compute import api as compute
from nova.compute import build_results
from nova.compute import claims
from nova.compute import power_state
from nova.compute import resource_tracker
from nova.compute import rpcapi as compute_rpcapi
from nova.compute import task_states
from nova.compute import utils as compute_utils
from nova.compute.utils import wrap_instance_event
from nova.compute import vm_states
from nova import conductor
import nova.conf
from nova.console import rpcapi as console_rpcapi
import nova.context
from nova import exception
from nova import exception_wrapper
from nova import hooks
from nova.i18n import _
from nova import image
from nova import manager
from nova import network
from nova.network import base_api as base_net_api
from nova.network import model as network_model
from nova.network.security_group import openstack_driver
from nova import objects
from nova.objects import base as obj_base
from nova.objects import external_event as external_event_obj
from nova.objects import fields
from nova.objects import instance as obj_instance
from nova.objects import migrate_data as migrate_data_obj
from nova.pci import request as pci_req_module
from nova.pci import whitelist
from nova import rpc
from nova import safe_utils
from nova.scheduler.client import query
from nova.scheduler.client import report
from nova.scheduler import utils as scheduler_utils
from nova import utils
from nova.virt import block_device as driver_block_device
from nova.virt import configdrive
from nova.virt import driver
from nova.virt import event as virtevent
from nova.virt import hardware
from nova.virt import storage_users
from nova.virt import virtapi
from nova.volume import cinder

CONF = nova.conf.CONF

LOG = logging.getLogger(__name__)

get_notifier = functools.partial(rpc.get_notifier, service='compute')
wrap_exception = functools.partial(exception_wrapper.wrap_exception,
                                   get_notifier=get_notifier,
                                   binary='nova-compute')
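
# NOTE: get_notifier/wrap_exception above are used throughout this module so
# that unhandled exceptions raised by ComputeManager RPC methods also emit an
# error notification on the 'compute' notifier before being re-raised.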


@contextlib.contextmanager
def errors_out_migration_ctxt(migration):
    """Context manager to error out migration on failure."""
    try:
        yield
    except Exception:
        with excutils.save_and_reraise_exception():
            if migration:
                # We may have been passed None for our migration if we're
                # receiving from an older client. The migration will be
                # errored via the legacy path.
                migration.status = 'error'
                try:
                    migration.save()
                except Exception:
                    LOG.debug(
                        'Error setting migration status for instance %s.',
                        migration.instance_uuid, exc_info=True)


@utils.expects_func_args('migration')
def errors_out_migration(function):
    """Decorator to error out migration on failure."""

    @functools.wraps(function)
    def decorated_function(self, context, *args, **kwargs):
        wrapped_func = safe_utils.get_wrapped_function(function)
        keyed_args = inspect.getcallargs(wrapped_func, self, context,
                                         *args, **kwargs)
        migration = keyed_args['migration']
        with errors_out_migration_ctxt(migration):
            return function(self, context, *args, **kwargs)

    return decorated_function


@utils.expects_func_args('instance')
def reverts_task_state(function):
    """Decorator to revert task_state on failure."""

    @functools.wraps(function)
    def decorated_function(self, context, *args, **kwargs):
        try:
            return function(self, context, *args, **kwargs)
        except exception.UnexpectedTaskStateError as e:
            # Note(maoy): unexpected task state means the current
            # task is preempted. Do not clear task state in this
            # case.
            with excutils.save_and_reraise_exception():
                LOG.info("Task possibly preempted: %s",
                         e.format_message())
        except Exception:
            with excutils.save_and_reraise_exception():
                wrapped_func = safe_utils.get_wrapped_function(function)
                keyed_args = inspect.getcallargs(wrapped_func, self, context,
                                                 *args, **kwargs)
                # NOTE(mriedem): 'instance' must be in keyed_args because we
                # have utils.expects_func_args('instance') decorating this
                # method.
                instance = keyed_args['instance']
                original_task_state = instance.task_state
                try:
                    self._instance_update(context, instance, task_state=None)
                    LOG.info("Successfully reverted task state from %s on "
                             "failure for instance.",
                             original_task_state, instance=instance)
                except exception.InstanceNotFound:
                    # We might delete an instance that failed to build shortly
                    # after it errored out; this is an expected case and we
                    # should not trace on it.
                    pass
                except Exception as e:
                    LOG.warning("Failed to revert task state for instance. "
                                "Error: %s", e, instance=instance)

    return decorated_function


@utils.expects_func_args('instance')
def wrap_instance_fault(function):
    """Wraps a method to catch exceptions related to instances.

    This decorator wraps a method to catch any exceptions having to do with
    an instance that may get thrown. It then logs an instance fault in the db.
    """

    @functools.wraps(function)
    def decorated_function(self, context, *args, **kwargs):
        try:
            return function(self, context, *args, **kwargs)
        except exception.InstanceNotFound:
            raise
        except Exception as e:
            # NOTE(gtt): If argument 'instance' is in args rather than kwargs,
            # we will get a KeyError exception which will cover up the real
            # exception. So, we update kwargs with the values from args first.
            # Then, we can get 'instance' from kwargs easily.
            kwargs.update(dict(zip(function.__code__.co_varnames[2:], args)))

            with excutils.save_and_reraise_exception():
                compute_utils.add_instance_fault_from_exc(context,
                    kwargs['instance'], e, sys.exc_info())

    return decorated_function


@utils.expects_func_args('image_id', 'instance')
def delete_image_on_error(function):
    """Used for snapshot-related methods to ensure the image created in
    compute.api is deleted when an error occurs.
    """

    @functools.wraps(function)
    def decorated_function(self, context, image_id, instance,
                           *args, **kwargs):
        try:
            return function(self, context, image_id, instance,
                            *args, **kwargs)
        except Exception:
            with excutils.save_and_reraise_exception():
                compute_utils.delete_image(
                    context, instance, self.image_api, image_id,
                    log_exc_info=True)

    return decorated_function


# TODO(danms): Remove me after Icehouse
# TODO(alaski): Actually remove this after Newton, assuming a major RPC bump
# NOTE(mikal): if the method being decorated has more than one decorator, then
# put this one first. Otherwise the various exception handling decorators do
# not function correctly.
def object_compat(function):
    """Wraps a method that expects a new-world instance

    This provides compatibility for callers passing old-style dict
    instances.
    """

    @functools.wraps(function)
    def decorated_function(self, context, *args, **kwargs):
        def _load_instance(instance_or_dict):
            if isinstance(instance_or_dict, dict):
                # try to get metadata and system_metadata for most cases but
                # only attempt to load those if the db instance already has
                # those fields joined
                metas = [meta for meta in ('metadata', 'system_metadata')
                         if meta in instance_or_dict]
                instance = objects.Instance._from_db_object(
                    context, objects.Instance(), instance_or_dict,
                    expected_attrs=metas)
                instance._context = context
                return instance
            return instance_or_dict

        try:
            kwargs['instance'] = _load_instance(kwargs['instance'])
        except KeyError:
            args = (_load_instance(args[0]),) + args[1:]

        migration = kwargs.get('migration')
        if isinstance(migration, dict):
            migration = objects.Migration._from_db_object(
                context.elevated(), objects.Migration(),
                migration)
            kwargs['migration'] = migration

        return function(self, context, *args, **kwargs)

    return decorated_function
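

# NOTE: InstanceEvents (below) keeps a per-instance map of (event name, tag)
# to eventlet Event objects so that compute operations can block until an
# external event (for example a Neutron "network-vif-plugged" notification)
# is delivered via the external-events API.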
class InstanceEvents(object):
    def __init__(self):
        self._events = {}

    @staticmethod
    def _lock_name(instance):
        return '%s-%s' % (instance.uuid, 'events')

    def prepare_for_instance_event(self, instance, name, tag):
        """Prepare to receive an event for an instance.

        This will register an event for the given instance that we will
        wait on later. This should be called before initiating whatever
        action will trigger the event. The resulting eventlet.event.Event
        object should be wait()'d on to ensure completion.

        :param instance: the instance for which the event will be generated
        :param name: the name of the event we're expecting
        :param tag: the tag associated with the event we're expecting
        :returns: an event object that should be wait()'d on
        """
        if self._events is None:
            # NOTE(danms): We really should have a more specific error
            # here, but this is what we use for our default error case
            raise exception.NovaException('In shutdown, no new events '
                                          'can be scheduled')

        @utils.synchronized(self._lock_name(instance))
        def _create_or_get_event():
            instance_events = self._events.setdefault(instance.uuid, {})
            return instance_events.setdefault((name, tag),
                                              eventlet.event.Event())
        LOG.debug('Preparing to wait for external event %(name)s-%(tag)s',
                  {'name': name, 'tag': tag}, instance=instance)
        return _create_or_get_event()

    def pop_instance_event(self, instance, event):
        """Remove a pending event from the wait list.

        This will remove a pending event from the wait list so that it
        can be used to signal the waiters to wake up.

        :param instance: the instance for which the event was generated
        :param event: the nova.objects.external_event.InstanceExternalEvent
                      that describes the event
        :returns: the eventlet.event.Event object on which the waiters
                  are blocked
        """
        no_events_sentinel = object()
        no_matching_event_sentinel = object()

        @utils.synchronized(self._lock_name(instance))
        def _pop_event():
            if self._events is None:
                LOG.debug('Unexpected attempt to pop events during shutdown',
                          instance=instance)
                return no_events_sentinel
            events = self._events.get(instance.uuid)
            if not events:
                return no_events_sentinel
            _event = events.pop((event.name, event.tag), None)
            if not events:
                del self._events[instance.uuid]
            if _event is None:
                return no_matching_event_sentinel
            return _event

        result = _pop_event()

        if result is no_events_sentinel:
            LOG.debug('No waiting events found dispatching %(event)s',
                      {'event': event.key},
                      instance=instance)
            return None
        elif result is no_matching_event_sentinel:
            LOG.debug('No event matching %(event)s in %(events)s',
                      {'event': event.key,
                       'events': self._events.get(instance.uuid, {}).keys()},
                      instance=instance)
            return None
        else:
            return result

    def clear_events_for_instance(self, instance):
        """Remove all pending events for an instance.

        This will remove all events currently pending for an instance
        and return them (indexed by event name).

        :param instance: the instance for which events should be purged
        :returns: a dictionary of {event_name: eventlet.event.Event}
        """
        @utils.synchronized(self._lock_name(instance))
        def _clear_events():
            if self._events is None:
                LOG.debug('Unexpected attempt to clear events during shutdown',
                          instance=instance)
                return dict()
            # NOTE(danms): We have historically returned the raw internal
            # format here, which is {event.key: [events, ...]} so just
            # trivially convert it here.
            return {'%s-%s' % k: e
                    for k, e in self._events.pop(instance.uuid, {}).items()}
        return _clear_events()

    def cancel_all_events(self):
        if self._events is None:
            LOG.debug('Unexpected attempt to cancel events during shutdown.')
            return
        our_events = self._events
        # NOTE(danms): Block new events
        self._events = None

        for instance_uuid, events in our_events.items():
            for (name, tag), eventlet_event in events.items():
                LOG.debug('Canceling in-flight event %(name)s-%(tag)s for '
                          'instance %(instance_uuid)s',
                          {'name': name,
                           'tag': tag,
                           'instance_uuid': instance_uuid})
                event = objects.InstanceExternalEvent(
                    instance_uuid=instance_uuid,
                    name=name, status='failed',
                    tag=tag, data={})
                eventlet_event.send(event)
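

# NOTE: ComputeVirtAPI is the object handed to the virt driver when it is
# loaded; it lets the driver wait for externally delivered instance events
# and update compute provider traits in Placement without importing the
# compute manager directly.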
class ComputeVirtAPI(virtapi.VirtAPI):
    def __init__(self, compute):
        super(ComputeVirtAPI, self).__init__()
        self._compute = compute
        self.reportclient = compute.reportclient

    def _default_error_callback(self, event_name, instance):
        raise exception.NovaException(_('Instance event failed'))

    @contextlib.contextmanager
    def wait_for_instance_event(self, instance, event_names, deadline=300,
                                error_callback=None):
        """Plan to wait for some events, run some code, then wait.

        This context manager will first create plans to wait for the
        provided event_names, yield, and then wait for all the scheduled
        events to complete.

        Note that this uses an eventlet.timeout.Timeout to bound the
        operation, so callers should be prepared to catch that
        failure and handle that situation appropriately.

        If the event is not received by the specified timeout deadline,
        eventlet.timeout.Timeout is raised.

        If the event is received but did not have a 'completed'
        status, a NovaException is raised. If an error_callback is
        provided, instead of raising an exception as detailed above
        for the failure case, the callback will be called with the
        event_name and instance, and can return True to continue
        waiting for the rest of the events, False to stop processing,
        or raise an exception which will bubble up to the waiter.

        :param instance: The instance for which an event is expected
        :param event_names: A list of event names. Each element is a
                            tuple of strings to indicate (name, tag),
                            where name is required, but tag may be None.
        :param deadline: Maximum number of seconds we should wait for all
                         of the specified events to arrive.
        :param error_callback: A function to be called if an event arrives
        """
        if error_callback is None:
            error_callback = self._default_error_callback
        events = {}
        for event_name in event_names:
            name, tag = event_name
            event_name = objects.InstanceExternalEvent.make_key(name, tag)
            try:
                events[event_name] = (
                    self._compute.instance_events.prepare_for_instance_event(
                        instance, name, tag))
            except exception.NovaException:
                error_callback(event_name, instance)
                # NOTE(danms): Don't wait for any of the events. They
                # should all be canceled and fired immediately below,
                # but don't stick around if not.
                deadline = 0
        yield
        with eventlet.timeout.Timeout(deadline):
            for event_name, event in events.items():
                actual_event = event.wait()
                if actual_event.status == 'completed':
                    continue
                decision = error_callback(event_name, instance)
                if decision is False:
                    break

    def update_compute_provider_status(self, context, rp_uuid, enabled):
        """Used to add/remove the COMPUTE_STATUS_DISABLED trait on the provider

        :param context: nova auth RequestContext
        :param rp_uuid: UUID of a compute node resource provider in Placement
        :param enabled: True if the node is enabled in which case the trait
            would be removed, False if the node is disabled in which case
            the trait would be added.
        :raises: ResourceProviderTraitRetrievalFailed
        :raises: ResourceProviderUpdateConflict
        :raises: ResourceProviderUpdateFailed
        :raises: TraitRetrievalFailed
        :raises: keystoneauth1.exceptions.ClientException
        """
        trait_name = os_traits.COMPUTE_STATUS_DISABLED
        # Get the current traits (and generation) for the provider.
        # TODO(mriedem): Leverage the ProviderTree cache in get_provider_traits
        trait_info = self.reportclient.get_provider_traits(context, rp_uuid)
        # If the host is enabled, remove the trait (if set), else add
        # the trait if it doesn't already exist.
        original_traits = trait_info.traits
        new_traits = None
        if enabled and trait_name in original_traits:
            new_traits = original_traits - {trait_name}
            LOG.debug('Removing trait %s from compute node resource '
                      'provider %s in placement.', trait_name, rp_uuid)
        elif not enabled and trait_name not in original_traits:
            new_traits = original_traits | {trait_name}
            LOG.debug('Adding trait %s to compute node resource '
                      'provider %s in placement.', trait_name, rp_uuid)
        if new_traits is not None:
            self.reportclient.set_traits_for_provider(
                context, rp_uuid, new_traits)


class ComputeManager(manager.Manager):
    """Manages the running instances from creation to destruction."""

    target = messaging.Target(version='5.7')

    def __init__(self, compute_driver=None, *args, **kwargs):
        """Load configuration options and connect to the hypervisor."""
        # We want the ComputeManager, ResourceTracker and ComputeVirtAPI all
        # using the same instance of SchedulerReportClient which has the
        # ProviderTree cache for this compute service.
        self.reportclient = report.SchedulerReportClient()
        self.virtapi = ComputeVirtAPI(self)
        self.network_api = network.API()
        self.volume_api = cinder.API()
        self.image_api = image.API()
        self._last_bw_usage_poll = 0
        self._bw_usage_supported = True
        self.compute_api = compute.API()
        self.compute_rpcapi = compute_rpcapi.ComputeAPI()
        self.compute_task_api = conductor.ComputeTaskAPI()
        self.is_neutron_security_groups = (
            openstack_driver.is_neutron_security_groups())
        self.query_client = query.SchedulerQueryClient()
        self.instance_events = InstanceEvents()
        self._sync_power_pool = eventlet.GreenPool(
            size=CONF.sync_power_state_pool_size)
        self._syncs_in_progress = {}
        self.send_instance_updates = (
            CONF.filter_scheduler.track_instance_changes)
        if CONF.max_concurrent_builds != 0:
            self._build_semaphore = eventlet.semaphore.Semaphore(
                CONF.max_concurrent_builds)
        else:
            self._build_semaphore = compute_utils.UnlimitedSemaphore()
        if CONF.max_concurrent_live_migrations > 0:
            self._live_migration_executor = futurist.GreenThreadPoolExecutor(
                max_workers=CONF.max_concurrent_live_migrations)
        else:
            # CONF.max_concurrent_live_migrations is 0 (unlimited)
            self._live_migration_executor = futurist.GreenThreadPoolExecutor()
        # This is a dict, keyed by instance uuid, to a two-item tuple of
        # migration object and Future for the queued live migration.
        self._waiting_live_migrations = {}

        super(ComputeManager, self).__init__(service_name="compute",
                                             *args, **kwargs)

        # NOTE(russellb) Load the driver last. It may call back into the
        # compute manager via the virtapi, so we want it to be fully
        # initialized before that happens.
        self.driver = driver.load_compute_driver(self.virtapi, compute_driver)
        self.use_legacy_block_device_info = \
            self.driver.need_legacy_block_device_info
        self.rt = resource_tracker.ResourceTracker(
            self.host, self.driver, reportclient=self.reportclient)

    def reset(self):
        LOG.info('Reloading compute RPC API')
        compute_rpcapi.reset_globals()
        self.compute_rpcapi = compute_rpcapi.ComputeAPI()
        self.reportclient.clear_provider_cache()

    def _update_resource_tracker(self, context, instance):
        """Let the resource tracker know that an instance has changed state."""
        if instance.host == self.host:
            self.rt.update_usage(context, instance, instance.node)

    def _instance_update(self, context, instance, **kwargs):
        """Update an instance in the database using kwargs as value."""
        for k, v in kwargs.items():
            setattr(instance, k, v)
        instance.save()
        self._update_resource_tracker(context, instance)

    def _nil_out_instance_obj_host_and_node(self, instance):
        # NOTE(jwcroppe): We don't do instance.save() here for performance
        # reasons; a call to this is expected to be immediately followed by
        # another call that does instance.save(), thus avoiding two writes
        # to the database layer.
        instance.host = None
        instance.node = None
        # ResourceTracker._set_instance_host_and_node also sets launched_on
        # to the same value as host and is really only ever used by legacy
        # nova-network code, but we should also null it out to avoid confusion
        # if there is an instance in the database with no host set but
        # launched_on is set. Note that we do not care about using launched_on
        # as some kind of debug helper if diagnosing a build failure, that is
        # what instance action events are for.
        instance.launched_on = None
        # If the instance is not on a host, it's not in an aggregate and
        # therefore is not in an availability zone.
        instance.availability_zone = None

    def _set_instance_obj_error_state(self, context, instance,
                                      clean_task_state=False):
        try:
            instance.vm_state = vm_states.ERROR
            if clean_task_state:
                instance.task_state = None
            instance.save()
        except exception.InstanceNotFound:
            LOG.debug('Instance has been destroyed from under us while '
                      'trying to set it to ERROR', instance=instance)

    def _get_instances_on_driver(self, context, filters=None):
        """Return a list of instance records for the instances found
        on the hypervisor which satisfy the specified filters. If filters=None
        return a list of instance records for all the instances found on the
        hypervisor.
        """
        if not filters:
            filters = {}
        try:
            driver_uuids = self.driver.list_instance_uuids()
            if len(driver_uuids) == 0:
                # Short circuit, don't waste a DB call
                return objects.InstanceList()
            filters['uuid'] = driver_uuids
            local_instances = objects.InstanceList.get_by_filters(
                context, filters, use_slave=True)
            return local_instances
        except NotImplementedError:
            pass

        # The driver doesn't support uuids listing, so we'll have
        # to brute force.
        driver_instances = self.driver.list_instances()
        # NOTE(mjozefcz): In this case we need to apply host filter.
        # Without this all instance data would be fetched from db.
        filters['host'] = self.host
        instances = objects.InstanceList.get_by_filters(context, filters,
                                                        use_slave=True)
        name_map = {instance.name: instance for instance in instances}
        local_instances = []
        for driver_instance in driver_instances:
            instance = name_map.get(driver_instance)
            if not instance:
                continue
            local_instances.append(instance)
        return local_instances

    def _destroy_evacuated_instances(self, context, node_cache):
        """Destroys evacuated instances.

        While nova-compute was down, the instances running on it could be
        evacuated to another host. This method looks for evacuation migration
        records where this is the source host and which were either started
        (accepted), in-progress (pre-migrating) or migrated (done). From those
        migration records, local instances reported by the hypervisor are
        compared to the instances for the migration records and those local
        guests are destroyed, along with instance allocation records in
        Placement for this node.

        Then allocations are removed from Placement for every instance that is
        evacuated from this host regardless if the instance is reported by the
        hypervisor or not.

        :param context: The request context
        :param node_cache: A dict of ComputeNode objects keyed by the UUID of
            the compute node
        :return: A dict keyed by instance uuid mapped to Migration objects
            for instances that were migrated away from this host
        """
        filters = {
            'source_compute': self.host,
            # NOTE(mriedem): Migration records that have been accepted are
            # included in case the source node comes back up while instances
            # are being evacuated to another host. We don't want the same
            # instance being reported from multiple hosts.
            # NOTE(lyarwood): pre-migrating is also included here as the
            # source compute can come back online shortly after the RT
            # claims on the destination that in-turn moves the migration to
            # pre-migrating. If the evacuate fails on the destination host,
            # the user can rebuild the instance (in ERROR state) on the source
            # host.
            'status': ['accepted', 'pre-migrating', 'done'],
            'migration_type': 'evacuation',
        }
        with utils.temporary_mutation(context, read_deleted='yes'):
            evacuations = objects.MigrationList.get_by_filters(context,
                                                               filters)
        if not evacuations:
            return {}
        evacuations = {mig.instance_uuid: mig for mig in evacuations}

        # TODO(mriedem): We could optimize by pre-loading the joined fields
        # we know we'll use, like info_cache and flavor.
        local_instances = self._get_instances_on_driver(context)
        evacuated_local_instances = {inst.uuid: inst
                                     for inst in local_instances
                                     if inst.uuid in evacuations}

        for instance in evacuated_local_instances.values():
            LOG.info('Destroying instance as it has been evacuated from '
                     'this host but still exists in the hypervisor',
                     instance=instance)
            try:
                network_info = self.network_api.get_instance_nw_info(
                    context, instance)
                bdi = self._get_instance_block_device_info(context,
                                                           instance)
                destroy_disks = not (self._is_instance_storage_shared(
                    context, instance))
            except exception.InstanceNotFound:
                network_info = network_model.NetworkInfo()
                bdi = {}
                LOG.info('Instance has been marked deleted already, '
                         'removing it from the hypervisor.',
                         instance=instance)
                # always destroy disks if the instance was deleted
                destroy_disks = True
            self.driver.destroy(context, instance,
                                network_info,
                                bdi, destroy_disks)

        hostname_to_cn_uuid = {
            cn.hypervisor_hostname: cn.uuid
            for cn in node_cache.values()}

        for instance_uuid, migration in evacuations.items():
            try:
                if instance_uuid in evacuated_local_instances:
                    # Avoid the db call if we already have the instance loaded
                    # above
                    instance = evacuated_local_instances[instance_uuid]
                else:
                    instance = objects.Instance.get_by_uuid(
                        context, instance_uuid)
            except exception.InstanceNotFound:
                # The instance is already deleted, so we expect that every
                # allocation of that instance has already been cleaned up
                continue

            LOG.info('Cleaning up allocations of the instance as it has been '
                     'evacuated from this host',
                     instance=instance)
            if migration.source_node not in hostname_to_cn_uuid:
                LOG.error("Failed to clean allocation of evacuated "
                          "instance as the source node %s is not found",
                          migration.source_node, instance=instance)
                continue
            cn_uuid = hostname_to_cn_uuid[migration.source_node]

            # If the instance was deleted in the interim, assume its
            # allocations were properly cleaned up (either by its hosting
            # compute service or the API).
            if (not instance.deleted and
                    not self.reportclient.
                        remove_provider_tree_from_instance_allocation(
                            context, instance.uuid, cn_uuid)):
                LOG.error("Failed to clean allocation of evacuated instance "
                          "on the source node %s",
                          cn_uuid, instance=instance)

            migration.status = 'completed'
            migration.save()
        return evacuations

    def _is_instance_storage_shared(self, context, instance, host=None):
        shared_storage = True
        data = None
        try:
            data = self.driver.check_instance_shared_storage_local(context,
                                                                   instance)
            if data:
                shared_storage = (self.compute_rpcapi.
                                  check_instance_shared_storage(context,
                                      instance, data, host=host))
        except NotImplementedError:
            LOG.debug('Hypervisor driver does not support '
                      'instance shared storage check, '
                      'assuming it\'s not on shared storage',
                      instance=instance)
            shared_storage = False
        except Exception:
            LOG.exception('Failed to check if instance shared',
                          instance=instance)
        finally:
            if data:
                self.driver.check_instance_shared_storage_cleanup(context,
                                                                  data)
        return shared_storage

    def _complete_partial_deletion(self, context, instance):
        """Complete deletion for instances in DELETED status but not marked as
        deleted in the DB
        """
        instance.destroy()
        bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
            context, instance.uuid)
        self._complete_deletion(context,
                                instance)
        self._notify_about_instance_usage(context, instance, "delete.end")
        compute_utils.notify_about_instance_action(context, instance,
            self.host, action=fields.NotificationAction.DELETE,
            phase=fields.NotificationPhase.END, bdms=bdms)

    def _complete_deletion(self, context, instance):
        self._update_resource_tracker(context, instance)
        self.reportclient.delete_allocation_for_instance(context,
                                                         instance.uuid)
        self._clean_instance_console_tokens(context, instance)
        self._delete_scheduler_instance_info(context, instance.uuid)

    def _validate_pinning_configuration(self, instances):
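        # NOTE: summary of the checks below: make sure the pinned/unpinned
        # instances already on this host are consistent with the
        # '[compute] cpu_dedicated_set' / '[compute] cpu_shared_set' options
        # (raising InvalidConfiguration at startup if they are not), and warn
        # if an instance is pinned to host CPUs that are no longer in the
        # configured dedicated set.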
        if not self.driver.capabilities.get('supports_pcpus', False):
            return

        for instance in instances:
            # ignore deleted instances
            if instance.deleted:
                continue

            # if this is an unpinned instance and the host only has
            # 'cpu_dedicated_set' configured, we need to tell the operator to
            # correct their configuration
            if not (instance.numa_topology and
                    instance.numa_topology.cpu_pinning_requested):
                # we don't need to check 'vcpu_pin_set' since it can't coexist
                # alongside 'cpu_dedicated_set'
                if (CONF.compute.cpu_dedicated_set and
                        not CONF.compute.cpu_shared_set):
                    msg = _("This host has unpinned instances but has no CPUs "
                            "set aside for this purpose; configure '[compute] "
                            "cpu_shared_set' instead of, or in addition to, "
                            "'[compute] cpu_dedicated_set'")
                    raise exception.InvalidConfiguration(msg)

                continue

            # ditto for pinned instances if only 'cpu_shared_set' is configured
            if (CONF.compute.cpu_shared_set and
                    not CONF.compute.cpu_dedicated_set and
                    not CONF.vcpu_pin_set):
                msg = _("This host has pinned instances but has no CPUs "
                        "set aside for this purpose; configure '[compute] "
                        "cpu_dedicated_set' instead of, or in addition to, "
                        "'[compute] cpu_shared_set'")
                raise exception.InvalidConfiguration(msg)

            # also check to make sure the operator hasn't accidentally
            # dropped some cores that instances are currently using
            available_dedicated_cpus = (hardware.get_vcpu_pin_set() or
                                        hardware.get_cpu_dedicated_set())
            pinned_cpus = instance.numa_topology.cpu_pinning
            if available_dedicated_cpus and (
                    pinned_cpus - available_dedicated_cpus):
                # we can't raise an exception because of bug #1289064,
                # which meant we didn't recalculate CPU pinning information
                # when we live migrated a pinned instance
                LOG.warning(
                    "Instance is pinned to host CPUs %(cpus)s "
                    "but one or more of these CPUs are not included in "
                    "either '[compute] cpu_dedicated_set' or "
                    "'vcpu_pin_set'; you should update these "
                    "configuration options to include the missing CPUs "
                    "or rebuild or cold migrate this instance.",
                    {'cpus': list(pinned_cpus)},
                    instance=instance)

    def _reset_live_migration(self, context, instance):
        migration = None
        try:
            migration = objects.Migration.get_by_instance_and_status(
                context, instance.uuid, 'running')
            if migration:
                self.live_migration_abort(context, instance, migration.id)
        except Exception:
            LOG.exception('Failed to abort live-migration',
                          instance=instance)
        finally:
            if migration:
                self._set_migration_status(migration, 'error')
            LOG.info('Instance found in migrating state during '
                     'startup. Resetting task_state',
                     instance=instance)
            instance.task_state = None
            instance.save(expected_task_state=[task_states.MIGRATING])

    def _init_instance(self, context, instance):
        """Initialize this instance during service init."""

        # NOTE(danms): If the instance appears to not be owned by this
        # host, it may have been evacuated away, but skipped by the
        # evacuation cleanup code due to configuration. Thus, if that
        # is a possibility, don't touch the instance in any way, but
        # log the concern. This will help avoid potential issues on
        # startup due to misconfiguration.
        if instance.host != self.host:
            LOG.warning('Instance %(uuid)s appears to not be owned '
                        'by this host, but by %(host)s. Startup '
                        'processing is being skipped.',
                        {'uuid': instance.uuid,
                         'host': instance.host})
            return

        # Instances that are shut down, or in an error state can not be
        # initialized and are not attempted to be recovered. The exception
        # to this are instances that are in RESIZE_MIGRATING or DELETING,
        # which are dealt with further down.
        if (instance.vm_state == vm_states.SOFT_DELETED or
            (instance.vm_state == vm_states.ERROR and
             instance.task_state not in
             (task_states.RESIZE_MIGRATING, task_states.DELETING))):
            LOG.debug("Instance is in %s state.",
                      instance.vm_state, instance=instance)
            return

        if instance.vm_state == vm_states.DELETED:
            try:
                self._complete_partial_deletion(context, instance)
            except Exception:
                # we don't want an exception to block init_host
                LOG.exception('Failed to complete a deletion',
                              instance=instance)
            return

        if (instance.vm_state == vm_states.BUILDING or
                instance.task_state in [task_states.SCHEDULING,
                                        task_states.BLOCK_DEVICE_MAPPING,
                                        task_states.NETWORKING,
                                        task_states.SPAWNING]):
            # NOTE(dave-mcnally) compute stopped before instance was fully
            # spawned so set to ERROR state. This is safe to do as the state
            # may be set by the api but the host is not so if we get here the
            # instance has already been scheduled to this particular host.
            LOG.debug("Instance failed to spawn correctly, "
                      "setting to ERROR state", instance=instance)
            self._set_instance_obj_error_state(
                context, instance, clean_task_state=True)
            return

        if (instance.vm_state in [vm_states.ACTIVE, vm_states.STOPPED] and
                instance.task_state in [
                    task_states.REBUILDING,
                    task_states.REBUILD_BLOCK_DEVICE_MAPPING,
                    task_states.REBUILD_SPAWNING]):
            # NOTE(jichenjc) compute stopped before instance was fully
            # spawned so set to ERROR state. This is consistent to BUILD
            LOG.debug("Instance failed to rebuild correctly, "
                      "setting to ERROR state", instance=instance)
            self._set_instance_obj_error_state(
                context, instance, clean_task_state=True)
            return

        if (instance.vm_state != vm_states.ERROR and
                instance.task_state in [task_states.IMAGE_SNAPSHOT_PENDING,
                                        task_states.IMAGE_PENDING_UPLOAD,
                                        task_states.IMAGE_UPLOADING,
                                        task_states.IMAGE_SNAPSHOT]):
            LOG.debug("Instance in transitional state %s at start-up "
                      "clearing task state",
                      instance.task_state, instance=instance)
            try:
                self._post_interrupted_snapshot_cleanup(context, instance)
            except Exception:
                # we don't want an exception to block init_host
                LOG.exception('Failed to cleanup snapshot.', instance=instance)
            instance.task_state = None
            instance.save()

        if (instance.vm_state != vm_states.ERROR and
                instance.task_state in [task_states.RESIZE_PREP]):
            LOG.debug("Instance in transitional state %s at start-up "
                      "clearing task state",
                      instance['task_state'], instance=instance)
            instance.task_state = None
            instance.save()

        if instance.task_state == task_states.DELETING:
            try:
                LOG.info('Service started deleting the instance during '
                         'the previous run, but did not finish. Restarting'
                         ' the deletion now.', instance=instance)
                instance.obj_load_attr('metadata')
                instance.obj_load_attr('system_metadata')
                bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
                    context, instance.uuid)
                self._delete_instance(context, instance, bdms)
            except Exception:
                # we don't want an exception to block init_host
                LOG.exception('Failed to complete a deletion',
                              instance=instance)
                self._set_instance_obj_error_state(context, instance)
            return

        current_power_state = self._get_power_state(context, instance)
        try_reboot, reboot_type = self._retry_reboot(context, instance,
                                                     current_power_state)

        if try_reboot:
            LOG.debug("Instance in transitional state (%(task_state)s) at "
                      "start-up and power state is (%(power_state)s), "
                      "triggering reboot",
                      {'task_state': instance.task_state,
                       'power_state': current_power_state},
                      instance=instance)

            # NOTE(mikal): if the instance was doing a soft reboot that got as
            # far as shutting down the instance but not as far as starting it
            # again, then we've just become a hard reboot. That means the
            # task state for the instance needs to change so that we're in one
            # of the expected task states for a hard reboot.
            if (instance.task_state in task_states.soft_reboot_states and
                    reboot_type == 'HARD'):
                instance.task_state = task_states.REBOOT_PENDING_HARD
                instance.save()

            self.reboot_instance(context, instance, block_device_info=None,
                                 reboot_type=reboot_type)
            return

        elif (current_power_state == power_state.RUNNING and
              instance.task_state in [task_states.REBOOT_STARTED,
                                      task_states.REBOOT_STARTED_HARD,
                                      task_states.PAUSING,
                                      task_states.UNPAUSING]):
            LOG.warning("Instance in transitional state "
                        "(%(task_state)s) at start-up and power state "
                        "is (%(power_state)s), clearing task state",
                        {'task_state': instance.task_state,
                         'power_state': current_power_state},
                        instance=instance)
            instance.task_state = None
            instance.vm_state = vm_states.ACTIVE
            instance.save()
        elif (current_power_state == power_state.PAUSED and
              instance.task_state == task_states.UNPAUSING):
            LOG.warning("Instance in transitional state "
                        "(%(task_state)s) at start-up and power state "
                        "is (%(power_state)s), clearing task state "
                        "and unpausing the instance",
                        {'task_state': instance.task_state,
                         'power_state': current_power_state},
                        instance=instance)
            try:
                self.unpause_instance(context, instance)
            except NotImplementedError:
                # Some virt drivers don't support pause and unpause
                pass
            except Exception:
                LOG.exception('Failed to unpause instance', instance=instance)
            return

        if instance.task_state == task_states.POWERING_OFF:
            try:
                LOG.debug("Instance in transitional state %s at start-up "
                          "retrying stop request",
                          instance.task_state, instance=instance)
                self.stop_instance(context, instance, True)
            except Exception:
                # we don't want an exception to block init_host
                LOG.exception('Failed to stop instance', instance=instance)
            return

        if instance.task_state == task_states.POWERING_ON:
            try:
                LOG.debug("Instance in transitional state %s at start-up "
                          "retrying start request",
                          instance.task_state, instance=instance)
                self.start_instance(context, instance)
            except Exception:
                # we don't want an exception to block init_host
                LOG.exception('Failed to start instance', instance=instance)
            return

        net_info = instance.get_network_info()
        try:
            self.driver.plug_vifs(instance, net_info)
        except NotImplementedError as e:
            LOG.debug(e, instance=instance)
        except exception.VirtualInterfacePlugException:
            # NOTE(mriedem): If we get here, it could be because the vif_type
            # in the cache is "binding_failed" or "unbound".
            # The periodic task _heal_instance_info_cache checks for this
            # condition. It should fix this by binding the ports again when
            # it gets to this instance.
            LOG.exception('Virtual interface plugging failed for instance. '
                          'The port binding:host_id may need to be manually '
                          'updated.', instance=instance)
            self._set_instance_obj_error_state(context, instance)
            return

        if instance.task_state == task_states.RESIZE_MIGRATING:
            # We crashed during resize/migration, so roll back for safety
            try:
                # NOTE(mriedem): check old_vm_state for STOPPED here, if it's
                # not in system_metadata we default to True for backwards
                # compatibility
                power_on = (instance.system_metadata.get('old_vm_state') !=
                            vm_states.STOPPED)

                block_dev_info = self._get_instance_block_device_info(context,
                                                                      instance)

                migration = objects.Migration.get_by_id_and_instance(
                    context, instance.migration_context.migration_id,
                    instance.uuid)
                self.driver.finish_revert_migration(context, instance,
                    net_info, migration, block_dev_info, power_on)

            except Exception:
                LOG.exception('Failed to revert crashed migration',
                              instance=instance)
            finally:
                LOG.info('Instance found in migrating state during '
                         'startup. Resetting task_state',
                         instance=instance)
                instance.task_state = None
                instance.save()
        if instance.task_state == task_states.MIGRATING:
            # Live migration did not complete, but instance is on this
            # host. Abort ongoing migration if still running and reset state.
            self._reset_live_migration(context, instance)

        db_state = instance.power_state
        drv_state = self._get_power_state(context, instance)
        expect_running = (db_state == power_state.RUNNING and
                          drv_state != db_state)

        LOG.debug('Current state is %(drv_state)s, state in DB is '
                  '%(db_state)s.',
                  {'drv_state': drv_state, 'db_state': db_state},
                  instance=instance)

        if expect_running and CONF.resume_guests_state_on_host_boot:
            self._resume_guests_state(context, instance, net_info)
        elif drv_state == power_state.RUNNING:
            # VMwareAPI drivers will raise an exception
            try:
                self.driver.ensure_filtering_rules_for_instance(
                    instance, net_info)
            except NotImplementedError:
                LOG.debug('Hypervisor driver does not support '
                          'firewall rules', instance=instance)

    def _resume_guests_state(self, context, instance, net_info):
        LOG.info('Rebooting instance after nova-compute restart.',
                 instance=instance)
        block_device_info = \
            self._get_instance_block_device_info(context, instance)

        try:
            self.driver.resume_state_on_host_boot(
                context, instance, net_info, block_device_info)
        except NotImplementedError:
            LOG.warning('Hypervisor driver does not support '
                        'resume guests', instance=instance)
        except Exception:
            # NOTE(vish): The instance failed to resume, so we set the
            # instance to error and attempt to continue.
            LOG.warning('Failed to resume instance',
                        instance=instance)
            self._set_instance_obj_error_state(context, instance)

    def _retry_reboot(self, context, instance, current_power_state):
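        # NOTE: decide whether an interrupted reboot should be retried and,
        # if so, whether it should be a SOFT or HARD reboot: a reboot is
        # retried when the task state was left at REBOOT_PENDING[_HARD] (and
        # the vm_state allows that type of reboot), or when the reboot had
        # already started but the guest is no longer running.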
        current_task_state = instance.task_state
        retry_reboot = False
        reboot_type = compute_utils.get_reboot_type(current_task_state,
                                                    current_power_state)

        pending_soft = (
            current_task_state == task_states.REBOOT_PENDING and
            instance.vm_state in vm_states.ALLOW_SOFT_REBOOT)
        pending_hard = (
            current_task_state == task_states.REBOOT_PENDING_HARD and
            instance.vm_state in vm_states.ALLOW_HARD_REBOOT)
        started_not_running = (current_task_state in
                               [task_states.REBOOT_STARTED,
                                task_states.REBOOT_STARTED_HARD] and
                               current_power_state != power_state.RUNNING)

        if pending_soft or pending_hard or started_not_running:
            retry_reboot = True

        return retry_reboot, reboot_type
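
    # NOTE: handle_lifecycle_event is invoked (via handle_events below) by
    # the virt driver's event emitter for guest lifecycle events; it maps the
    # event to a Nova power state, syncs the instance record if the event is
    # still current, and activates destination port bindings early for live
    # migrations entering post-copy or being suspended for migration.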
    def handle_lifecycle_event(self, event):
        LOG.info("VM %(state)s (Lifecycle Event)",
                 {'state': event.get_name()},
                 instance_uuid=event.get_instance_uuid())
        context = nova.context.get_admin_context(read_deleted='yes')
        vm_power_state = None
        event_transition = event.get_transition()
        if event_transition == virtevent.EVENT_LIFECYCLE_STOPPED:
            vm_power_state = power_state.SHUTDOWN
        elif event_transition == virtevent.EVENT_LIFECYCLE_STARTED:
            vm_power_state = power_state.RUNNING
        elif event_transition in (
                virtevent.EVENT_LIFECYCLE_PAUSED,
                virtevent.EVENT_LIFECYCLE_POSTCOPY_STARTED,
                virtevent.EVENT_LIFECYCLE_MIGRATION_COMPLETED):
            vm_power_state = power_state.PAUSED
        elif event_transition == virtevent.EVENT_LIFECYCLE_RESUMED:
            vm_power_state = power_state.RUNNING
        elif event_transition == virtevent.EVENT_LIFECYCLE_SUSPENDED:
            vm_power_state = power_state.SUSPENDED
        else:
            LOG.warning("Unexpected lifecycle event: %d", event_transition)

        migrate_finish_statuses = {
            # This happens on the source node and indicates live migration
            # entered post-copy mode.
            virtevent.EVENT_LIFECYCLE_POSTCOPY_STARTED: 'running (post-copy)',
            # Suspended for offline migration.
            virtevent.EVENT_LIFECYCLE_MIGRATION_COMPLETED: 'running'
        }

        expected_attrs = []
        if event_transition in migrate_finish_statuses:
            # Join on info_cache since that's needed in migrate_instance_start.
            expected_attrs.append('info_cache')
        instance = objects.Instance.get_by_uuid(context,
                                                event.get_instance_uuid(),
                                                expected_attrs=expected_attrs)

        # Note(lpetrut): The event may be delayed, thus not reflecting
        # the current instance power state. In that case, ignore the event.
        current_power_state = self._get_power_state(context, instance)
        if current_power_state == vm_power_state:
            LOG.debug('Synchronizing instance power state after lifecycle '
                      'event "%(event)s"; current vm_state: %(vm_state)s, '
                      'current task_state: %(task_state)s, current DB '
                      'power_state: %(db_power_state)s, VM power_state: '
                      '%(vm_power_state)s',
                      {'event': event.get_name(),
                       'vm_state': instance.vm_state,
                       'task_state': instance.task_state,
                       'db_power_state': instance.power_state,
                       'vm_power_state': vm_power_state},
                      instance_uuid=instance.uuid)
            self._sync_instance_power_state(context,
                                            instance,
                                            vm_power_state)

        # The following checks are for live migration. We want to activate
        # the port binding for the destination host before the live migration
        # is resumed on the destination host in order to reduce network
        # downtime. Otherwise the ports are bound to the destination host
        # in post_live_migration_at_destination.
        # TODO(danms): Explore options for using a different live migration
        # specific callback for this instead of piggy-backing on the
        # handle_lifecycle_event callback.
        if (instance.task_state == task_states.MIGRATING and
                event_transition in migrate_finish_statuses):
            status = migrate_finish_statuses[event_transition]
            try:
                migration = objects.Migration.get_by_instance_and_status(
                    context, instance.uuid, status)
                LOG.debug('Binding ports to destination host: %s',
                          migration.dest_compute, instance=instance)
                # For neutron, migrate_instance_start will activate the
                # destination host port bindings, if there are any created by
                # conductor before live migration started.
                self.network_api.migrate_instance_start(
                    context, instance, migration)
            except exception.MigrationNotFoundByStatus:
                LOG.warning("Unable to find migration record with status "
                            "'%s' for instance. Port binding will happen in "
                            "post live migration.", status, instance=instance)

    def handle_events(self, event):
        if isinstance(event, virtevent.LifecycleEvent):
            try:
                self.handle_lifecycle_event(event)
            except exception.InstanceNotFound:
                LOG.debug("Event %s arrived for non-existent instance. The "
                          "instance was probably deleted.", event)
        else:
            LOG.debug("Ignoring event %s", event)

    def init_virt_events(self):
        if CONF.workarounds.handle_virt_lifecycle_events:
            self.driver.register_event_listener(self.handle_events)
        else:
            # NOTE(mriedem): If the _sync_power_states periodic task is
            # disabled we should emit a warning in the logs.
            if CONF.sync_power_state_interval < 0:
                LOG.warning('Instance lifecycle events from the compute '
                            'driver have been disabled. Note that lifecycle '
                            'changes to an instance outside of the compute '
                            'service will not be synchronized '
                            'automatically since the _sync_power_states '
                            'periodic task is also disabled.')
            else:
                LOG.info('Instance lifecycle events from the compute '
                         'driver have been disabled. Note that lifecycle '
                         'changes to an instance outside of the compute '
                         'service will only be synchronized by the '
                         '_sync_power_states periodic task.')

    def _get_nodes(self, context):
        """Query the ComputeNode objects from the DB that are reported by the
        hypervisor.

        :param context: the request context
        :return: a dict of ComputeNode objects keyed by the UUID of the given
            node.
        """
        nodes_by_uuid = {}
        try:
            node_names = self.driver.get_available_nodes()
        except exception.VirtDriverNotReady:
            LOG.warning(
                "Virt driver is not ready. If this is the first time this "
                "service is starting on this host, then you can ignore this "
                "warning.")
            return {}

        for node_name in node_names:
            try:
                node = objects.ComputeNode.get_by_host_and_nodename(
                    context, self.host, node_name)
                nodes_by_uuid[node.uuid] = node
            except exception.ComputeHostNotFound:
                LOG.warning(
                    "Compute node %s not found in the database. If this is "
                    "the first time this service is starting on this host, "
                    "then you can ignore this warning.", node_name)
        return nodes_by_uuid
  1197. def init_host(self):
  1198. """Initialization for a standalone compute service."""
  1199. if CONF.pci.passthrough_whitelist:
  1200. # Simply loading the PCI passthrough whitelist will do a bunch of
  1201. # validation that would otherwise wait until the PciDevTracker is
  1202. # constructed when updating available resources for the compute
  1203. # node(s) in the resource tracker, effectively killing that task.
  1204. # So load up the whitelist when starting the compute service to
  1205. # flush any invalid configuration early so we can kill the service
  1206. # if the configuration is wrong.
  1207. whitelist.Whitelist(CONF.pci.passthrough_whitelist)
  1208. nova.conf.neutron.register_dynamic_opts(CONF)
  1209. # Override the number of concurrent disk operations allowed if the
  1210. # user has specified a limit.
  1211. if CONF.compute.max_concurrent_disk_ops != 0:
  1212. compute_utils.disk_ops_semaphore = \
  1213. eventlet.semaphore.BoundedSemaphore(
  1214. CONF.compute.max_concurrent_disk_ops)
  1215. self.driver.init_host(host=self.host)
  1216. context = nova.context.get_admin_context()
  1217. instances = objects.InstanceList.get_by_host(
  1218. context, self.host,
  1219. expected_attrs=['info_cache', 'metadata', 'numa_topology'])
  1220. if CONF.defer_iptables_apply:
  1221. self.driver.filter_defer_apply_on()
  1222. self.init_virt_events()
  1223. self._validate_pinning_configuration(instances)
  1224. # NOTE(gibi): At this point the compute_nodes of the resource tracker
  1225. # has not been populated yet so we cannot rely on the resource tracker
  1226. # here.
1227. # NOTE(gibi): If the slow start time of the ironic and vcenter virt
1228. # drivers becomes problematic here then we should consider adding a
1229. # config option or a driver flag to tell us whether to run
1230. # _destroy_evacuated_instances and
1231. # _error_out_instances_whose_build_was_interrupted in a background
1232. # thread on startup.
  1233. nodes_by_uuid = self._get_nodes(context)
  1234. try:
1235. # Check that the instance was not already evacuated to another host.
  1236. evacuated_instances = self._destroy_evacuated_instances(
  1237. context, nodes_by_uuid)
  1238. # Initialise instances on the host that are not evacuating
  1239. for instance in instances:
  1240. if instance.uuid not in evacuated_instances:
  1241. self._init_instance(context, instance)
1242. # NOTE(gibi): collect all the instance uuids that were already
1243. # handled above in some way, either by _init_instance or by
1244. # _destroy_evacuated_instances. This way we can limit the scope of
1245. # the _error_out_instances_whose_build_was_interrupted call to look
1246. # only for instances that have allocations on this node and were not
1247. # handled by the above calls.
  1248. already_handled = {instance.uuid for instance in instances}.union(
  1249. evacuated_instances)
  1250. self._error_out_instances_whose_build_was_interrupted(
  1251. context, already_handled, nodes_by_uuid.keys())
  1252. finally:
  1253. if CONF.defer_iptables_apply:
  1254. self.driver.filter_defer_apply_off()
  1255. if instances:
  1256. # We only send the instance info to the scheduler on startup
  1257. # if there is anything to send, otherwise this host might
  1258. # not be mapped yet in a cell and the scheduler may have
  1259. # issues dealing with the information. Later changes to
  1260. # instances on this host will update the scheduler, or the
  1261. # _sync_scheduler_instance_info periodic task will.
  1262. self._update_scheduler_instance_info(context, instances)
  1263. def _error_out_instances_whose_build_was_interrupted(
  1264. self, context, already_handled_instances, node_uuids):
  1265. """If there are instances in BUILDING state that are not
  1266. assigned to this host but have allocations in placement towards
  1267. this compute that means the nova-compute service was
  1268. restarted while those instances waited for the resource claim
  1269. to finish and the _set_instance_host_and_node() to update the
  1270. instance.host field. We need to push them to ERROR state here to
  1271. prevent keeping them in BUILDING state forever.
  1272. :param context: The request context
  1273. :param already_handled_instances: The set of instance UUIDs that the
  1274. host initialization process already handled in some way.
  1275. :param node_uuids: The list of compute node uuids handled by this
  1276. service
  1277. """
  1278. # Strategy:
  1279. # 1) Get the allocations from placement for our compute node(s)
  1280. # 2) Remove the already handled instances from the consumer list;
  1281. # they are either already initialized or need to be skipped.
  1282. # 3) Check which remaining consumer is an instance in BUILDING state
  1283. # and push it to ERROR state.
  1284. LOG.info(
  1285. "Looking for unclaimed instances stuck in BUILDING status for "
  1286. "nodes managed by this host")
  1287. for cn_uuid in node_uuids:
  1288. try:
  1289. f = self.reportclient.get_allocations_for_resource_provider
  1290. allocations = f(context, cn_uuid).allocations
  1291. except (exception.ResourceProviderAllocationRetrievalFailed,
  1292. keystone_exception.ClientException) as e:
  1293. LOG.error(
  1294. "Could not retrieve compute node resource provider %s and "
  1295. "therefore unable to error out any instances stuck in "
  1296. "BUILDING state. Error: %s", cn_uuid, six.text_type(e))
  1297. continue
  1298. not_handled_consumers = (set(allocations) -
  1299. already_handled_instances)
  1300. if not not_handled_consumers:
  1301. continue
  1302. filters = {
  1303. 'vm_state': vm_states.BUILDING,
  1304. 'uuid': not_handled_consumers
  1305. }
  1306. instances = objects.InstanceList.get_by_filters(
  1307. context, filters, expected_attrs=[])
  1308. for instance in instances:
  1309. LOG.debug(
  1310. "Instance spawn was interrupted before instance_claim, "
  1311. "setting instance to ERROR state", instance=instance)
  1312. self._set_instance_obj_error_state(
  1313. context, instance, clean_task_state=True)
  1314. def cleanup_host(self):
  1315. self.driver.register_event_listener(None)
  1316. self.instance_events.cancel_all_events()
  1317. self.driver.cleanup_host(host=self.host)
  1318. self._cleanup_live_migrations_in_pool()
  1319. def _cleanup_live_migrations_in_pool(self):
  1320. # Shutdown the pool so we don't get new requests.
  1321. self._live_migration_executor.shutdown(wait=False)
  1322. # For any queued migrations, cancel the migration and update
  1323. # its status.
  1324. for migration, future in self._waiting_live_migrations.values():
  1325. # If we got here before the Future was submitted then we need
  1326. # to move on since there isn't anything we can do.
  1327. if future is None:
  1328. continue
  1329. if future.cancel():
  1330. self._set_migration_status(migration, 'cancelled')
  1331. LOG.info('Successfully cancelled queued live migration.',
  1332. instance_uuid=migration.instance_uuid)
  1333. else:
  1334. LOG.warning('Unable to cancel live migration.',
  1335. instance_uuid=migration.instance_uuid)
  1336. self._waiting_live_migrations.clear()
  1337. def pre_start_hook(self):
  1338. """After the service is initialized, but before we fully bring
  1339. the service up by listening on RPC queues, make sure to update
  1340. our available resources (and indirectly our available nodes).
  1341. """
  1342. self.update_available_resource(nova.context.get_admin_context(),
  1343. startup=True)
  1344. def _get_power_state(self, context, instance):
  1345. """Retrieve the power state for the given instance."""
  1346. LOG.debug('Checking state', instance=instance)
  1347. try:
  1348. return self.driver.get_info(instance, use_cache=False).state
  1349. except exception.InstanceNotFound:
  1350. return power_state.NOSTATE
  1351. def get_console_topic(self, context):
  1352. """Retrieves the console host for a project on this host.
  1353. Currently this is just set in the flags for each compute host.
  1354. """
  1355. # TODO(mdragon): perhaps make this variable by console_type?
  1356. return '%s.%s' % (console_rpcapi.RPC_TOPIC, CONF.console_host)
  1357. @wrap_exception()
  1358. def get_console_pool_info(self, context, console_type):
  1359. return self.driver.get_console_pool_info(console_type)
  1360. @wrap_exception()
  1361. def refresh_instance_security_rules(self, context, instance):
  1362. """Tell the virtualization driver to refresh security rules for
  1363. an instance.
  1364. Passes straight through to the virtualization driver.
  1365. Synchronize the call because we may still be in the middle of
  1366. creating the instance.
  1367. """
  1368. @utils.synchronized(instance.uuid)
  1369. def _sync_refresh():
  1370. try:
  1371. return self.driver.refresh_instance_security_rules(instance)
  1372. except NotImplementedError:
  1373. LOG.debug('Hypervisor driver does not support '
  1374. 'security groups.', instance=instance)
  1375. return _sync_refresh()
  1376. def _await_block_device_map_created(self, context, vol_id):
  1377. # TODO(yamahata): creating volume simultaneously
  1378. # reduces creation time?
  1379. # TODO(yamahata): eliminate dumb polling
  1380. start = time.time()
  1381. retries = CONF.block_device_allocate_retries
  1382. # (1) if the configured value is 0, one attempt should be made
1383. # (2) if the configured value is > 0, then the total number of
1384. # attempts is (retries + 1)
  1385. attempts = 1
  1386. if retries >= 1:
  1387. attempts = retries + 1
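# Example (illustrative): block_device_allocate_retries = 3 gives
# attempts = 4, i.e. up to four status polls of the volume with
# block_device_allocate_retries_interval seconds of sleep between them.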
  1388. for attempt in range(1, attempts + 1):
  1389. volume = self.volume_api.get(context, vol_id)
  1390. volume_status = volume['status']
  1391. if volume_status not in ['creating', 'downloading']:
  1392. if volume_status == 'available':
  1393. return attempt
  1394. LOG.warning("Volume id: %(vol_id)s finished being "
  1395. "created but its status is %(vol_status)s.",
  1396. {'vol_id': vol_id,
  1397. 'vol_status': volume_status})
  1398. break
  1399. greenthread.sleep(CONF.block_device_allocate_retries_interval)
  1400. raise exception.VolumeNotCreated(volume_id=vol_id,
  1401. seconds=int(time.time() - start),
  1402. attempts=attempt,
  1403. volume_status=volume_status)
  1404. def _decode_files(self, injected_files):
  1405. """Base64 decode the list of files to inject."""
  1406. if not injected_files:
  1407. return []
  1408. def _decode(f):
  1409. path, contents = f
  1410. # Py3 raises binascii.Error instead of TypeError as in Py27
  1411. try:
  1412. decoded = base64.b64decode(contents)
  1413. return path, decoded
  1414. except (TypeError, binascii.Error):
  1415. raise exception.Base64Exception(path=path)
  1416. return [_decode(f) for f in injected_files]
  1417. def _validate_instance_group_policy(self, context, instance,
  1418. scheduler_hints):
  1419. # NOTE(russellb) Instance group policy is enforced by the scheduler.
  1420. # However, there is a race condition with the enforcement of
  1421. # the policy. Since more than one instance may be scheduled at the
  1422. # same time, it's possible that more than one instance with an
  1423. # anti-affinity policy may end up here. It's also possible that
  1424. # multiple instances with an affinity policy could end up on different
  1425. # hosts. This is a validation step to make sure that starting the
  1426. # instance here doesn't violate the policy.
  1427. group_hint = scheduler_hints.get('group')
  1428. if not group_hint:
  1429. return
  1430. # The RequestSpec stores scheduler_hints as key=list pairs so we need
  1431. # to check the type on the value and pull the single entry out. The
  1432. # API request schema validates that the 'group' hint is a single value.
  1433. if isinstance(group_hint, list):
  1434. group_hint = group_hint[0]
  1435. @utils.synchronized(group_hint)
  1436. def _do_validation(context, instance, group_hint):
  1437. group = objects.InstanceGroup.get_by_hint(context, group_hint)
  1438. if group.policy and 'anti-affinity' == group.policy:
  1439. instances_uuids = objects.InstanceList.get_uuids_by_host(
  1440. context, self.host)
  1441. ins_on_host = set(instances_uuids)
  1442. members = set(group.members)
  1443. # Determine the set of instance group members on this host
  1444. # which are not the instance in question. This is used to
  1445. # determine how many other members from the same anti-affinity
  1446. # group can be on this host.
  1447. members_on_host = ins_on_host & members - set([instance.uuid])
  1448. rules = group.rules
  1449. if rules and 'max_server_per_host' in rules:
  1450. max_server = rules['max_server_per_host']
  1451. else:
  1452. max_server = 1
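# Example (illustrative): with rules = {'max_server_per_host': 2} and two
# other members of this anti-affinity group already on the host,
# len(members_on_host) >= max_server holds and the build is rescheduled.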
  1453. if len(members_on_host) >= max_server:
  1454. msg = _("Anti-affinity instance group policy "
  1455. "was violated.")
  1456. raise exception.RescheduledException(
  1457. instance_uuid=instance.uuid,
  1458. reason=msg)
  1459. elif group.policy and 'affinity' == group.policy:
  1460. group_hosts = group.get_hosts(exclude=[instance.uuid])
  1461. if group_hosts and self.host not in group_hosts:
  1462. msg = _("Affinity instance group policy was violated.")
  1463. raise exception.RescheduledException(
  1464. instance_uuid=instance.uuid,
  1465. reason=msg)
  1466. if not CONF.workarounds.disable_group_policy_check_upcall:
  1467. _do_validation(context, instance, group_hint)
  1468. def _log_original_error(self, exc_info, instance_uuid):
  1469. LOG.error('Error: %s', exc_info[1], instance_uuid=instance_uuid,
  1470. exc_info=exc_info)
  1471. @periodic_task.periodic_task
  1472. def _check_instance_build_time(self, context):
  1473. """Ensure that instances are not stuck in build."""
  1474. timeout = CONF.instance_build_timeout
  1475. if timeout == 0:
  1476. return
  1477. filters = {'vm_state': vm_states.BUILDING,
  1478. 'host': self.host}
  1479. building_insts = objects.InstanceList.get_by_filters(context,
  1480. filters, expected_attrs=[], use_slave=True)
  1481. for instance in building_insts:
  1482. if timeutils.is_older_than(instance.created_at, timeout):
  1483. self._set_instance_obj_error_state(context, instance)
  1484. LOG.warning("Instance build timed out. Set to error "
  1485. "state.", instance=instance)
  1486. def _check_instance_exists(self, context, instance):
  1487. """Ensure an instance with the same name is not already present."""
  1488. if self.driver.instance_exists(instance):
  1489. raise exception.InstanceExists(name=instance.name)
  1490. def _allocate_network_async(self, context, instance, requested_networks,
  1491. security_groups, is_vpn,
  1492. resource_provider_mapping):
  1493. """Method used to allocate networks in the background.
  1494. Broken out for testing.
  1495. """
  1496. # First check to see if we're specifically not supposed to allocate
  1497. # networks because if so, we can exit early.
  1498. if requested_networks and requested_networks.no_allocate:
  1499. LOG.debug("Not allocating networking since 'none' was specified.",
  1500. instance=instance)
  1501. return network_model.NetworkInfo([])
  1502. LOG.debug("Allocating IP information in the background.",
  1503. instance=instance)
  1504. retries = CONF.network_allocate_retries
  1505. attempts = retries + 1
  1506. retry_time = 1
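# Example (illustrative): with network_allocate_retries = 3 there are four
# attempts in total, sleeping 1, 2 and then 4 seconds between failed
# attempts (retry_time doubles below, capped at 30 seconds).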
  1507. bind_host_id = self.driver.network_binding_host_id(context, instance)
  1508. for attempt in range(1, attempts + 1):
  1509. try:
  1510. nwinfo = self.network_api.allocate_for_instance(
  1511. context, instance, vpn=is_vpn,
  1512. requested_networks=requested_networks,
  1513. security_groups=security_groups,
  1514. bind_host_id=bind_host_id,
  1515. resource_provider_mapping=resource_provider_mapping)
  1516. LOG.debug('Instance network_info: |%s|', nwinfo,
  1517. instance=instance)
  1518. instance.system_metadata['network_allocated'] = 'True'
  1519. # NOTE(JoshNang) do not save the instance here, as it can cause
  1520. # races. The caller shares a reference to instance and waits
  1521. # for this async greenthread to finish before calling
  1522. # instance.save().
  1523. return nwinfo
  1524. except Exception:
  1525. exc_info = sys.exc_info()
  1526. log_info = {'attempt': attempt,
  1527. 'attempts': attempts}
  1528. if attempt == attempts:
  1529. LOG.exception('Instance failed network setup '
  1530. 'after %(attempts)d attempt(s)',
  1531. log_info)
  1532. six.reraise(*exc_info)
  1533. LOG.warning('Instance failed network setup '
  1534. '(attempt %(attempt)d of %(attempts)d)',
  1535. log_info, instance=instance)
  1536. time.sleep(retry_time)
  1537. retry_time *= 2
  1538. if retry_time > 30:
  1539. retry_time = 30
  1540. # Not reached.
  1541. def _build_networks_for_instance(self, context, instance,
  1542. requested_networks, security_groups, resource_provider_mapping):
  1543. # If we're here from a reschedule the network may already be allocated.
  1544. if strutils.bool_from_string(
  1545. instance.system_metadata.get('network_allocated', 'False')):
1546. # NOTE(alex_xu): network_allocated being True means the network
1547. # resources were already allocated during a previous scheduling
1548. # attempt and the setup on the previous host was cleaned up. After
1549. # rescheduling, the network needs to be set up on the new host.
  1550. self.network_api.setup_instance_network_on_host(
  1551. context, instance, instance.host)
  1552. return self.network_api.get_instance_nw_info(context, instance)
  1553. if not self.is_neutron_security_groups:
  1554. security_groups = []
  1555. network_info = self._allocate_network(context, instance,
  1556. requested_networks, security_groups,
  1557. resource_provider_mapping)
  1558. return network_info
  1559. def _allocate_network(self, context, instance, requested_networks,
  1560. security_groups, resource_provider_mapping):
  1561. """Start network allocation asynchronously. Return an instance
  1562. of NetworkInfoAsyncWrapper that can be used to retrieve the
  1563. allocated networks when the operation has finished.
  1564. """
  1565. # NOTE(comstud): Since we're allocating networks asynchronously,
  1566. # this task state has little meaning, as we won't be in this
  1567. # state for very long.
  1568. instance.vm_state = vm_states.BUILDING
  1569. instance.task_state = task_states.NETWORKING
  1570. instance.save(expected_task_state=[None])
  1571. is_vpn = False
  1572. return network_model.NetworkInfoAsyncWrapper(
  1573. self._allocate_network_async, context, instance,
  1574. requested_networks, security_groups, is_vpn,
  1575. resource_provider_mapping)
  1576. def _default_root_device_name(self, instance, image_meta, root_bdm):
  1577. """Gets a default root device name from the driver.
  1578. :param nova.objects.Instance instance:
  1579. The instance for which to get the root device name.
  1580. :param nova.objects.ImageMeta image_meta:
  1581. The metadata of the image of the instance.
  1582. :param nova.objects.BlockDeviceMapping root_bdm:
  1583. The description of the root device.
  1584. :returns: str -- The default root device name.
  1585. :raises: InternalError, TooManyDiskDevices
  1586. """
  1587. try:
  1588. return self.driver.default_root_device_name(instance,
  1589. image_meta,
  1590. root_bdm)
  1591. except NotImplementedError:
  1592. return compute_utils.get_next_device_name(instance, [])
  1593. def _default_device_names_for_instance(self, instance,
  1594. root_device_name,
  1595. *block_device_lists):
  1596. """Default the missing device names in the BDM from the driver.
  1597. :param nova.objects.Instance instance:
  1598. The instance for which to get default device names.
  1599. :param str root_device_name: The root device name.
  1600. :param list block_device_lists: List of block device mappings.
  1601. :returns: None
  1602. :raises: InternalError, TooManyDiskDevices
  1603. """
  1604. try:
  1605. self.driver.default_device_names_for_instance(instance,
  1606. root_device_name,
  1607. *block_device_lists)
  1608. except NotImplementedError:
  1609. compute_utils.default_device_names_for_instance(
  1610. instance, root_device_name, *block_device_lists)
  1611. def _get_device_name_for_instance(self, instance, bdms, block_device_obj):
  1612. """Get the next device name from the driver, based on the BDM.
  1613. :param nova.objects.Instance instance:
  1614. The instance whose volume is requesting a device name.
  1615. :param nova.objects.BlockDeviceMappingList bdms:
  1616. The block device mappings for the instance.
  1617. :param nova.objects.BlockDeviceMapping block_device_obj:
  1618. A block device mapping containing info about the requested block
  1619. device.
  1620. :returns: The next device name.
  1621. :raises: InternalError, TooManyDiskDevices
  1622. """
  1623. # NOTE(ndipanov): Copy obj to avoid changing the original
  1624. block_device_obj = block_device_obj.obj_clone()
  1625. try:
  1626. return self.driver.get_device_name_for_instance(
  1627. instance, bdms, block_device_obj)
  1628. except NotImplementedError:
  1629. return compute_utils.get_device_name_for_instance(
  1630. instance, bdms, block_device_obj.get("device_name"))
  1631. def _default_block_device_names(self, instance, image_meta, block_devices):
  1632. """Verify that all the devices have the device_name set. If not,
  1633. provide a default name.
1634. It also ensures that there is a root_device_name, and that it is set
1635. to the first block device in the boot sequence (boot_index=0).
  1636. """
  1637. root_bdm = block_device.get_root_bdm(block_devices)
  1638. if not root_bdm:
  1639. return
  1640. # Get the root_device_name from the root BDM or the instance
  1641. root_device_name = None
  1642. update_root_bdm = False
  1643. if root_bdm.device_name:
  1644. root_device_name = root_bdm.device_name
  1645. instance.root_device_name = root_device_name
  1646. elif instance.root_device_name:
  1647. root_device_name = instance.root_device_name
  1648. root_bdm.device_name = root_device_name
  1649. update_root_bdm = True
  1650. else:
  1651. root_device_name = self._default_root_device_name(instance,
  1652. image_meta,
  1653. root_bdm)
  1654. instance.root_device_name = root_device_name
  1655. root_bdm.device_name = root_device_name
  1656. update_root_bdm = True
  1657. if update_root_bdm:
  1658. root_bdm.save()
  1659. ephemerals = list(filter(block_device.new_format_is_ephemeral,
  1660. block_devices))
  1661. swap = list(filter(block_device.new_format_is_swap,
  1662. block_devices))
  1663. block_device_mapping = list(filter(
  1664. driver_block_device.is_block_device_mapping, block_devices))
  1665. self._default_device_names_for_instance(instance,
  1666. root_device_name,
  1667. ephemerals,
  1668. swap,
  1669. block_device_mapping)
  1670. def _block_device_info_to_legacy(self, block_device_info):
  1671. """Convert BDI to the old format for drivers that need it."""
  1672. if self.use_legacy_block_device_info:
  1673. ephemerals = driver_block_device.legacy_block_devices(
  1674. driver.block_device_info_get_ephemerals(block_device_info))
  1675. mapping = driver_block_device.legacy_block_devices(
  1676. driver.block_device_info_get_mapping(block_device_info))
  1677. swap = block_device_info['swap']
  1678. if swap:
  1679. swap = swap.legacy()
  1680. block_device_info.update({
  1681. 'ephemerals': ephemerals,
  1682. 'swap': swap,
  1683. 'block_device_mapping': mapping})
  1684. def _add_missing_dev_names(self, bdms, instance):
  1685. for bdm in bdms:
  1686. if bdm.device_name is not None:
  1687. continue
  1688. device_name = self._get_device_name_for_instance(instance,
  1689. bdms, bdm)
  1690. values = {'device_name': device_name}
  1691. bdm.update(values)
  1692. bdm.save()
  1693. def _prep_block_device(self, context, instance, bdms):
  1694. """Set up the block device for an instance with error logging."""
  1695. try:
  1696. self._add_missing_dev_names(bdms, instance)
  1697. block_device_info = driver.get_block_device_info(instance, bdms)
  1698. mapping = driver.block_device_info_get_mapping(block_device_info)
  1699. driver_block_device.attach_block_devices(
  1700. mapping, context, instance, self.volume_api, self.driver,
  1701. wait_func=self._await_block_device_map_created)
  1702. self._block_device_info_to_legacy(block_device_info)
  1703. return block_device_info
  1704. except exception.OverQuota as e:
  1705. LOG.warning('Failed to create block device for instance due'
  1706. ' to exceeding volume related resource quota.'
  1707. ' Error: %s', e.message, instance=instance)
  1708. raise
  1709. except Exception as ex:
  1710. LOG.exception('Instance failed block device setup',
  1711. instance=instance)
  1712. # InvalidBDM will eventually result in a BuildAbortException when
  1713. # booting from volume, and will be recorded as an instance fault.
  1714. # Maintain the original exception message which most likely has
  1715. # useful details which the standard InvalidBDM error message lacks.
  1716. raise exception.InvalidBDM(six.text_type(ex))
  1717. def _update_instance_after_spawn(self, context, instance,
  1718. vm_state=vm_states.ACTIVE):
  1719. instance.power_state = self._get_power_state(context, instance)
  1720. instance.vm_state = vm_state
  1721. instance.task_state = None
  1722. # NOTE(sean-k-mooney): configdrive.update_instance checks
  1723. # instance.launched_at to determine if it is the first or
  1724. # subsequent spawn of an instance. We need to call update_instance
  1725. # first before setting instance.launched_at or instance.config_drive
  1726. # will never be set to true based on the value of force_config_drive.
  1727. # As a result the config drive will be lost on a hard reboot of the
  1728. # instance even when force_config_drive=true. see bug #1835822.
  1729. configdrive.update_instance(instance)
  1730. instance.launched_at = timeutils.utcnow()
  1731. def _update_scheduler_instance_info(self, context, instance):
  1732. """Sends an InstanceList with created or updated Instance objects to
  1733. the Scheduler client.
  1734. In the case of init_host, the value passed will already be an
  1735. InstanceList. Other calls will send individual Instance objects that
  1736. have been created or resized. In this case, we create an InstanceList
  1737. object containing that Instance.
  1738. """
  1739. if not self.send_instance_updates:
  1740. return
  1741. if isinstance(instance, obj_instance.Instance):
  1742. instance = objects.InstanceList(objects=[instance])
  1743. context = context.elevated()
  1744. self.query_client.update_instance_info(context, self.host,
  1745. instance)
  1746. def _delete_scheduler_instance_info(self, context, instance_uuid):
  1747. """Sends the uuid of the deleted Instance to the Scheduler client."""
  1748. if not self.send_instance_updates:
  1749. return
  1750. context = context.elevated()
  1751. self.query_client.delete_instance_info(context, self.host,
  1752. instance_uuid)
  1753. @periodic_task.periodic_task(spacing=CONF.scheduler_instance_sync_interval)
  1754. def _sync_scheduler_instance_info(self, context):
  1755. if not self.send_instance_updates:
  1756. return
  1757. context = context.elevated()
  1758. instances = objects.InstanceList.get_by_host(context, self.host,
  1759. expected_attrs=[],
  1760. use_slave=True)
  1761. uuids = [instance.uuid for instance in instances]
  1762. self.query_client.sync_instance_info(context, self.host, uuids)
  1763. def _notify_about_instance_usage(self, context, instance, event_suffix,
  1764. network_info=None, extra_usage_info=None,
  1765. fault=None):
  1766. compute_utils.notify_about_instance_usage(
  1767. self.notifier, context, instance, event_suffix,
  1768. network_info=network_info,
  1769. extra_usage_info=extra_usage_info, fault=fault)
  1770. def _deallocate_network(self, context, instance,
  1771. requested_networks=None):
  1772. # If we were told not to allocate networks let's save ourselves
  1773. # the trouble of calling the network API.
  1774. if requested_networks and requested_networks.no_allocate:
  1775. LOG.debug("Skipping network deallocation for instance since "
  1776. "networking was not requested.", instance=instance)
  1777. return
  1778. LOG.debug('Deallocating network for instance', instance=instance)
  1779. with timeutils.StopWatch() as timer:
  1780. self.network_api.deallocate_for_instance(
  1781. context, instance, requested_networks=requested_networks)
  1782. # nova-network does an rpc call so we're OK tracking time spent here
  1783. LOG.info('Took %0.2f seconds to deallocate network for instance.',
  1784. timer.elapsed(), instance=instance)
  1785. def _get_instance_block_device_info(self, context, instance,
  1786. refresh_conn_info=False,
  1787. bdms=None):
  1788. """Transform block devices to the driver block_device format."""
  1789. if bdms is None:
  1790. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  1791. context, instance.uuid)
  1792. block_device_info = driver.get_block_device_info(instance, bdms)
  1793. if not refresh_conn_info:
  1794. # if the block_device_mapping has no value in connection_info
  1795. # (returned as None), don't include in the mapping
  1796. block_device_info['block_device_mapping'] = [
  1797. bdm for bdm in driver.block_device_info_get_mapping(
  1798. block_device_info)
  1799. if bdm.get('connection_info')]
  1800. else:
  1801. driver_block_device.refresh_conn_infos(
  1802. driver.block_device_info_get_mapping(block_device_info),
  1803. context, instance, self.volume_api, self.driver)
  1804. self._block_device_info_to_legacy(block_device_info)
  1805. return block_device_info
  1806. def _build_failed(self, node):
  1807. if CONF.compute.consecutive_build_service_disable_threshold:
  1808. # NOTE(danms): Update our counter, but wait for the next
  1809. # update_available_resource() periodic to flush it to the DB
  1810. self.rt.build_failed(node)
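# Illustrative configuration (example value, not necessarily the default)
# for the threshold checked above:
#   [compute]
#   consecutive_build_service_disable_threshold = 10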
  1811. def _build_succeeded(self, node):
  1812. self.rt.build_succeeded(node)
  1813. @wrap_exception()
  1814. @reverts_task_state
  1815. @wrap_instance_fault
  1816. def build_and_run_instance(self, context, instance, image, request_spec,
  1817. filter_properties, admin_password=None,
  1818. injected_files=None, requested_networks=None,
  1819. security_groups=None, block_device_mapping=None,
  1820. node=None, limits=None, host_list=None):
  1821. @utils.synchronized(instance.uuid)
  1822. def _locked_do_build_and_run_instance(*args, **kwargs):
  1823. # NOTE(danms): We grab the semaphore with the instance uuid
  1824. # locked because we could wait in line to build this instance
  1825. # for a while and we want to make sure that nothing else tries
  1826. # to do anything with this instance while we wait.
  1827. with self._build_semaphore:
  1828. try:
  1829. result = self._do_build_and_run_instance(*args, **kwargs)
  1830. except Exception:
  1831. # NOTE(mriedem): This should really only happen if
  1832. # _decode_files in _do_build_and_run_instance fails, and
  1833. # that's before a guest is spawned so it's OK to remove
  1834. # allocations for the instance for this node from Placement
  1835. # below as there is no guest consuming resources anyway.
  1836. # The _decode_files case could be handled more specifically
  1837. # but that's left for another day.
  1838. result = build_results.FAILED
  1839. raise
  1840. finally:
  1841. if result == build_results.FAILED:
  1842. # Remove the allocation records from Placement for the
  1843. # instance if the build failed. The instance.host is
  1844. # likely set to None in _do_build_and_run_instance
  1845. # which means if the user deletes the instance, it
  1846. # will be deleted in the API, not the compute service.
  1847. # Setting the instance.host to None in
  1848. # _do_build_and_run_instance means that the
  1849. # ResourceTracker will no longer consider this instance
  1850. # to be claiming resources against it, so we want to
  1851. # reflect that same thing in Placement. No need to
  1852. # call this for a reschedule, as the allocations will
  1853. # have already been removed in
  1854. # self._do_build_and_run_instance().
  1855. self.reportclient.delete_allocation_for_instance(
  1856. context, instance.uuid)
  1857. if result in (build_results.FAILED,
  1858. build_results.RESCHEDULED):
  1859. self._build_failed(node)
  1860. else:
  1861. self._build_succeeded(node)
  1862. # NOTE(danms): We spawn here to return the RPC worker thread back to
  1863. # the pool. Since what follows could take a really long time, we don't
  1864. # want to tie up RPC workers.
  1865. utils.spawn_n(_locked_do_build_and_run_instance,
  1866. context, instance, image, request_spec,
  1867. filter_properties, admin_password, injected_files,
  1868. requested_networks, security_groups,
  1869. block_device_mapping, node, limits, host_list)
  1870. def _check_device_tagging(self, requested_networks, block_device_mapping):
  1871. tagging_requested = False
  1872. if requested_networks:
  1873. for net in requested_networks:
  1874. if 'tag' in net and net.tag is not None:
  1875. tagging_requested = True
  1876. break
  1877. if block_device_mapping and not tagging_requested:
  1878. for bdm in block_device_mapping:
  1879. if 'tag' in bdm and bdm.tag is not None:
  1880. tagging_requested = True
  1881. break
  1882. if (tagging_requested and
  1883. not self.driver.capabilities.get('supports_device_tagging',
  1884. False)):
  1885. raise exception.BuildAbortException('Attempt to boot guest with '
  1886. 'tagged devices on host that '
  1887. 'does not support tagging.')
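# Example (illustrative): a boot request passing a block device mapping or
# a network interface with a 'tag' (e.g. tag='database') sets
# tagging_requested above; the build is aborted unless the driver reports
# the 'supports_device_tagging' capability.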
  1888. def _check_trusted_certs(self, instance):
  1889. if (instance.trusted_certs and
  1890. not self.driver.capabilities.get('supports_trusted_certs',
  1891. False)):
  1892. raise exception.BuildAbortException(
  1893. 'Trusted image certificates provided on host that does not '
  1894. 'support certificate validation.')
  1895. @hooks.add_hook('build_instance')
  1896. @wrap_exception()
  1897. @reverts_task_state
  1898. @wrap_instance_event(prefix='compute')
  1899. @wrap_instance_fault
  1900. def _do_build_and_run_instance(self, context, instance, image,
  1901. request_spec, filter_properties, admin_password, injected_files,
  1902. requested_networks, security_groups, block_device_mapping,
  1903. node=None, limits=None, host_list=None):
  1904. try:
  1905. LOG.debug('Starting instance...', instance=instance)
  1906. instance.vm_state = vm_states.BUILDING
  1907. instance.task_state = None
  1908. instance.save(expected_task_state=
  1909. (task_states.SCHEDULING, None))
  1910. except exception.InstanceNotFound:
  1911. msg = 'Instance disappeared before build.'
  1912. LOG.debug(msg, instance=instance)
  1913. return build_results.FAILED
  1914. except exception.UnexpectedTaskStateError as e:
  1915. LOG.debug(e.format_message(), instance=instance)
  1916. return build_results.FAILED
  1917. # b64 decode the files to inject:
  1918. decoded_files = self._decode_files(injected_files)
  1919. if limits is None:
  1920. limits = {}
  1921. if node is None:
  1922. node = self._get_nodename(instance, refresh=True)
  1923. try:
  1924. with timeutils.StopWatch() as timer:
  1925. self._build_and_run_instance(context, instance, image,
  1926. decoded_files, admin_password, requested_networks,
  1927. security_groups, block_device_mapping, node, limits,
  1928. filter_properties, request_spec)
  1929. LOG.info('Took %0.2f seconds to build instance.',
  1930. timer.elapsed(), instance=instance)
  1931. return build_results.ACTIVE
  1932. except exception.RescheduledException as e:
  1933. retry = filter_properties.get('retry')
  1934. if not retry:
  1935. # no retry information, do not reschedule.
  1936. LOG.debug("Retry info not present, will not reschedule",
  1937. instance=instance)
  1938. self._cleanup_allocated_networks(context, instance,
  1939. requested_networks)
  1940. self._cleanup_volumes(context, instance,
  1941. block_device_mapping, raise_exc=False)
  1942. compute_utils.add_instance_fault_from_exc(context,
  1943. instance, e, sys.exc_info(),
  1944. fault_message=e.kwargs['reason'])
  1945. self._nil_out_instance_obj_host_and_node(instance)
  1946. self._set_instance_obj_error_state(context, instance,
  1947. clean_task_state=True)
  1948. return build_results.FAILED
  1949. LOG.debug(e.format_message(), instance=instance)
  1950. # This will be used for logging the exception
  1951. retry['exc'] = traceback.format_exception(*sys.exc_info())
  1952. # This will be used for setting the instance fault message
  1953. retry['exc_reason'] = e.kwargs['reason']
  1954. # NOTE(comstud): Deallocate networks if the driver wants
  1955. # us to do so.
  1956. # NOTE(mriedem): Always deallocate networking when using Neutron.
  1957. # This is to unbind any ports that the user supplied in the server
  1958. # create request, or delete any ports that nova created which were
  1959. # meant to be bound to this host. This check intentionally bypasses
  1960. # the result of deallocate_networks_on_reschedule because the
  1961. # default value in the driver is False, but that method was really
  1962. # only meant for Ironic and should be removed when nova-network is
  1963. # removed (since is_neutron() will then always be True).
  1964. # NOTE(vladikr): SR-IOV ports should be deallocated to
  1965. # allow new sriov pci devices to be allocated on a new host.
  1966. # Otherwise, if devices with pci addresses are already allocated
  1967. # on the destination host, the instance will fail to spawn.
  1968. # info_cache.network_info should be present at this stage.
  1969. if (self.driver.deallocate_networks_on_reschedule(instance) or
  1970. utils.is_neutron() or
  1971. self.deallocate_sriov_ports_on_reschedule(instance)):
  1972. self._cleanup_allocated_networks(context, instance,
  1973. requested_networks)
  1974. else:
  1975. # NOTE(alex_xu): Network already allocated and we don't
  1976. # want to deallocate them before rescheduling. But we need
  1977. # to cleanup those network resources setup on this host before
  1978. # rescheduling.
  1979. self.network_api.cleanup_instance_network_on_host(
  1980. context, instance, self.host)
  1981. self._nil_out_instance_obj_host_and_node(instance)
  1982. instance.task_state = task_states.SCHEDULING
  1983. instance.save()
  1984. # The instance will have already claimed resources from this host
  1985. # before this build was attempted. Now that it has failed, we need
  1986. # to unclaim those resources before casting to the conductor, so
  1987. # that if there are alternate hosts available for a retry, it can
  1988. # claim resources on that new host for the instance.
  1989. self.reportclient.delete_allocation_for_instance(context,
  1990. instance.uuid)
  1991. self.compute_task_api.build_instances(context, [instance],
  1992. image, filter_properties, admin_password,
  1993. injected_files, requested_networks, security_groups,
  1994. block_device_mapping, request_spec=request_spec,
  1995. host_lists=[host_list])
  1996. return build_results.RESCHEDULED
  1997. except (exception.InstanceNotFound,
  1998. exception.UnexpectedDeletingTaskStateError):
  1999. msg = 'Instance disappeared during build.'
  2000. LOG.debug(msg, instance=instance)
  2001. self._cleanup_allocated_networks(context, instance,
  2002. requested_networks)
  2003. return build_results.FAILED
  2004. except Exception as e:
  2005. if isinstance(e, exception.BuildAbortException):
  2006. LOG.error(e.format_message(), instance=instance)
  2007. else:
  2008. # Should not reach here.
  2009. LOG.exception('Unexpected build failure, not rescheduling '
  2010. 'build.', instance=instance)
  2011. self._cleanup_allocated_networks(context, instance,
  2012. requested_networks)
  2013. self._cleanup_volumes(context, instance,
  2014. block_device_mapping, raise_exc=False)
  2015. compute_utils.add_instance_fault_from_exc(context, instance,
  2016. e, sys.exc_info())
  2017. self._nil_out_instance_obj_host_and_node(instance)
  2018. self._set_instance_obj_error_state(context, instance,
  2019. clean_task_state=True)
  2020. return build_results.FAILED
  2021. def deallocate_sriov_ports_on_reschedule(self, instance):
  2022. """Determine if networks are needed to be deallocated before reschedule
  2023. Check the cached network info for any assigned SR-IOV ports.
  2024. SR-IOV ports should be deallocated prior to rescheduling
  2025. in order to allow new sriov pci devices to be allocated on a new host.
  2026. """
  2027. info_cache = instance.info_cache
  2028. def _has_sriov_port(vif):
  2029. return vif['vnic_type'] in network_model.VNIC_TYPES_SRIOV
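# Example (illustrative): ports with a vnic_type such as 'direct' or
# 'macvtap' count as SR-IOV here, while a 'normal' port does not.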
  2030. if (info_cache and info_cache.network_info):
  2031. for vif in info_cache.network_info:
  2032. if _has_sriov_port(vif):
  2033. return True
  2034. return False
  2035. @staticmethod
  2036. def _get_scheduler_hints(filter_properties, request_spec=None):
  2037. """Helper method to get scheduler hints.
  2038. This method prefers to get the hints out of the request spec, but that
  2039. might not be provided. Conductor will pass request_spec down to the
  2040. first compute chosen for a build but older computes will not pass
2041. the request_spec to conductor's build_instances method for a
2042. reschedule, so if we're on a host via a retry, request_spec may not
2043. be provided, in which case we need to fall back to the
2044. filter_properties to get the scheduler hints.
  2045. """
  2046. hints = {}
  2047. if request_spec is not None and 'scheduler_hints' in request_spec:
  2048. hints = request_spec.scheduler_hints
  2049. if not hints:
  2050. hints = filter_properties.get('scheduler_hints') or {}
  2051. return hints
  2052. @staticmethod
  2053. def _get_request_group_mapping(request_spec):
  2054. """Return request group resource - provider mapping. This is currently
  2055. used for Neutron ports that have resource request due to the port
  2056. having QoS minimum bandwidth policy rule attached.
  2057. :param request_spec: A RequestSpec object
  2058. :returns: A dict keyed by RequestGroup requester_id, currently Neutron
  2059. port_id, to resource provider UUID that provides resource for that
  2060. RequestGroup.
  2061. """
  2062. if (request_spec and
  2063. 'requested_resources' in request_spec and
  2064. request_spec.requested_resources is not None):
  2065. return {
  2066. group.requester_id: group.provider_uuids
  2067. for group in request_spec.requested_resources
  2068. }
  2069. else:
  2070. return None
  2071. def _update_pci_request_spec_with_allocated_interface_name(
  2072. self, context, instance, request_group_resource_providers_mapping):
  2073. if not instance.pci_requests:
  2074. return
  2075. def needs_update(pci_request, mapping):
  2076. return (pci_request.requester_id and
  2077. pci_request.requester_id in mapping)
  2078. modified = False
  2079. for pci_request in instance.pci_requests.requests:
  2080. if needs_update(
  2081. pci_request, request_group_resource_providers_mapping):
  2082. provider_uuids = request_group_resource_providers_mapping[
  2083. pci_request.requester_id]
  2084. if len(provider_uuids) != 1:
  2085. reason = (
2086. 'Allocating resources from more than one resource '
2087. 'provider %(providers)s for a single pci request '
  2088. '%(requester)s is not supported.' %
  2089. {'providers': provider_uuids,
  2090. 'requester': pci_request.requester_id})
  2091. raise exception.BuildAbortException(
  2092. instance_uuid=instance.uuid,
  2093. reason=reason)
  2094. dev_rp_name = self.reportclient.get_resource_provider_name(
  2095. context,
  2096. provider_uuids[0])
  2097. # NOTE(gibi): the device RP name reported by neutron is
  2098. # structured like <hostname>:<agentname>:<interfacename>
  2099. rp_name_pieces = dev_rp_name.split(':')
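# Example (illustrative): a device RP named
# 'compute-1:NIC Switch agent:ens786f1' splits into three pieces and
# 'ens786f1' is used as the parent_ifname below.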
  2100. if len(rp_name_pieces) != 3:
  2101. reason = (
  2102. 'Resource provider %(provider)s used to allocate '
  2103. 'resources for the pci request %(requester)s does not '
2104. 'have a properly formatted name. Expected name format '
  2105. 'is <hostname>:<agentname>:<interfacename>, but got '
  2106. '%(provider_name)s' %
  2107. {'provider': provider_uuids[0],
  2108. 'requester': pci_request.requester_id,
  2109. 'provider_name': dev_rp_name})
  2110. raise exception.BuildAbortException(
  2111. instance_uuid=instance.uuid,
  2112. reason=reason)
  2113. for spec in pci_request.spec:
  2114. spec['parent_ifname'] = rp_name_pieces[2]
  2115. modified = True
  2116. if modified:
  2117. instance.save()
  2118. def _build_and_run_instance(self, context, instance, image, injected_files,
  2119. admin_password, requested_networks, security_groups,
  2120. block_device_mapping, node, limits, filter_properties,
  2121. request_spec=None):
  2122. image_name = image.get('name')
  2123. self._notify_about_instance_usage(context, instance, 'create.start',
  2124. extra_usage_info={'image_name': image_name})
  2125. compute_utils.notify_about_instance_create(
  2126. context, instance, self.host,
  2127. phase=fields.NotificationPhase.START,
  2128. bdms=block_device_mapping)
  2129. # NOTE(mikal): cache the keystone roles associated with the instance
  2130. # at boot time for later reference
  2131. instance.system_metadata.update(
  2132. {'boot_roles': ','.join(context.roles)})
  2133. self._check_device_tagging(requested_networks, block_device_mapping)
  2134. self._check_trusted_certs(instance)
  2135. request_group_resource_providers_mapping = \
  2136. self._get_request_group_mapping(request_spec)
  2137. if request_group_resource_providers_mapping:
  2138. self._update_pci_request_spec_with_allocated_interface_name(
  2139. context, instance, request_group_resource_providers_mapping)
  2140. # TODO(Luyao) cut over to get_allocs_for_consumer
  2141. allocs = self.reportclient.get_allocations_for_consumer(
  2142. context, instance.uuid)
  2143. try:
  2144. scheduler_hints = self._get_scheduler_hints(filter_properties,
  2145. request_spec)
  2146. with self.rt.instance_claim(context, instance, node, allocs,
  2147. limits):
  2148. # NOTE(russellb) It's important that this validation be done
  2149. # *after* the resource tracker instance claim, as that is where
  2150. # the host is set on the instance.
  2151. self._validate_instance_group_policy(context, instance,
  2152. scheduler_hints)
  2153. image_meta = objects.ImageMeta.from_dict(image)
  2154. request_group_resource_providers_mapping = \
  2155. self._get_request_group_mapping(request_spec)
  2156. with self._build_resources(context, instance,
  2157. requested_networks, security_groups, image_meta,
  2158. block_device_mapping,
  2159. request_group_resource_providers_mapping) as resources:
  2160. instance.vm_state = vm_states.BUILDING
  2161. instance.task_state = task_states.SPAWNING
  2162. # NOTE(JoshNang) This also saves the changes to the
  2163. # instance from _allocate_network_async, as they aren't
  2164. # saved in that function to prevent races.
  2165. instance.save(expected_task_state=
  2166. task_states.BLOCK_DEVICE_MAPPING)
  2167. block_device_info = resources['block_device_info']
  2168. network_info = resources['network_info']
  2169. LOG.debug('Start spawning the instance on the hypervisor.',
  2170. instance=instance)
  2171. with timeutils.StopWatch() as timer:
  2172. self.driver.spawn(context, instance, image_meta,
  2173. injected_files, admin_password,
  2174. allocs, network_info=network_info,
  2175. block_device_info=block_device_info)
  2176. LOG.info('Took %0.2f seconds to spawn the instance on '
  2177. 'the hypervisor.', timer.elapsed(),
  2178. instance=instance)
  2179. except (exception.InstanceNotFound,
  2180. exception.UnexpectedDeletingTaskStateError) as e:
  2181. with excutils.save_and_reraise_exception():
  2182. self._notify_about_instance_usage(context, instance,
  2183. 'create.error', fault=e)
  2184. tb = traceback.format_exc()
  2185. compute_utils.notify_about_instance_create(
  2186. context, instance, self.host,
  2187. phase=fields.NotificationPhase.ERROR, exception=e,
  2188. bdms=block_device_mapping, tb=tb)
  2189. except exception.ComputeResourcesUnavailable as e:
  2190. LOG.debug(e.format_message(), instance=instance)
  2191. self._notify_about_instance_usage(context, instance,
  2192. 'create.error', fault=e)
  2193. tb = traceback.format_exc()
  2194. compute_utils.notify_about_instance_create(
  2195. context, instance, self.host,
  2196. phase=fields.NotificationPhase.ERROR, exception=e,
  2197. bdms=block_device_mapping, tb=tb)
  2198. raise exception.RescheduledException(
  2199. instance_uuid=instance.uuid, reason=e.format_message())
  2200. except exception.BuildAbortException as e:
  2201. with excutils.save_and_reraise_exception():
  2202. LOG.debug(e.format_message(), instance=instance)
  2203. self._notify_about_instance_usage(context, instance,
  2204. 'create.error', fault=e)
  2205. tb = traceback.format_exc()
  2206. compute_utils.notify_about_instance_create(
  2207. context, instance, self.host,
  2208. phase=fields.NotificationPhase.ERROR, exception=e,
  2209. bdms=block_device_mapping, tb=tb)
  2210. except (exception.FixedIpLimitExceeded,
  2211. exception.NoMoreNetworks, exception.NoMoreFixedIps) as e:
  2212. LOG.warning('No more network or fixed IP to be allocated',
  2213. instance=instance)
  2214. self._notify_about_instance_usage(context, instance,
  2215. 'create.error', fault=e)
  2216. tb = traceback.format_exc()
  2217. compute_utils.notify_about_instance_create(
  2218. context, instance, self.host,
  2219. phase=fields.NotificationPhase.ERROR, exception=e,
  2220. bdms=block_device_mapping, tb=tb)
  2221. msg = _('Failed to allocate the network(s) with error %s, '
  2222. 'not rescheduling.') % e.format_message()
  2223. raise exception.BuildAbortException(instance_uuid=instance.uuid,
  2224. reason=msg)
  2225. except (exception.VirtualInterfaceCreateException,
  2226. exception.VirtualInterfaceMacAddressException,
  2227. exception.FixedIpInvalidOnHost,
  2228. exception.UnableToAutoAllocateNetwork,
  2229. exception.NetworksWithQoSPolicyNotSupported) as e:
  2230. LOG.exception('Failed to allocate network(s)',
  2231. instance=instance)
  2232. self._notify_about_instance_usage(context, instance,
  2233. 'create.error', fault=e)
  2234. tb = traceback.format_exc()
  2235. compute_utils.notify_about_instance_create(
  2236. context, instance, self.host,
  2237. phase=fields.NotificationPhase.ERROR, exception=e,
  2238. bdms=block_device_mapping, tb=tb)
  2239. msg = _('Failed to allocate the network(s), not rescheduling.')
  2240. raise exception.BuildAbortException(instance_uuid=instance.uuid,
  2241. reason=msg)
  2242. except (exception.FlavorDiskTooSmall,
  2243. exception.FlavorMemoryTooSmall,
  2244. exception.ImageNotActive,
  2245. exception.ImageUnacceptable,
  2246. exception.InvalidDiskInfo,
  2247. exception.InvalidDiskFormat,
  2248. cursive_exception.SignatureVerificationError,
  2249. exception.CertificateValidationFailed,
  2250. exception.VolumeEncryptionNotSupported,
  2251. exception.InvalidInput,
  2252. # TODO(mriedem): We should be validating RequestedVRamTooHigh
  2253. # in the API during server create and rebuild.
  2254. exception.RequestedVRamTooHigh) as e:
  2255. self._notify_about_instance_usage(context, instance,
  2256. 'create.error', fault=e)
  2257. tb = traceback.format_exc()
  2258. compute_utils.notify_about_instance_create(
  2259. context, instance, self.host,
  2260. phase=fields.NotificationPhase.ERROR, exception=e,
  2261. bdms=block_device_mapping, tb=tb)
  2262. raise exception.BuildAbortException(instance_uuid=instance.uuid,
  2263. reason=e.format_message())
  2264. except Exception as e:
  2265. LOG.exception('Failed to build and run instance',
  2266. instance=instance)
  2267. self._notify_about_instance_usage(context, instance,
  2268. 'create.error', fault=e)
  2269. tb = traceback.format_exc()
  2270. compute_utils.notify_about_instance_create(
  2271. context, instance, self.host,
  2272. phase=fields.NotificationPhase.ERROR, exception=e,
  2273. bdms=block_device_mapping, tb=tb)
  2274. raise exception.RescheduledException(
  2275. instance_uuid=instance.uuid, reason=six.text_type(e))
  2276. # NOTE(alaski): This is only useful during reschedules, remove it now.
  2277. instance.system_metadata.pop('network_allocated', None)
  2278. # If CONF.default_access_ip_network_name is set, grab the
  2279. # corresponding network and set the access ip values accordingly.
  2280. network_name = CONF.default_access_ip_network_name
  2281. if (network_name and not instance.access_ip_v4 and
  2282. not instance.access_ip_v6):
  2283. # Note that when there are multiple ips to choose from, an
  2284. # arbitrary one will be chosen.
  2285. for vif in network_info:
  2286. if vif['network']['label'] == network_name:
  2287. for ip in vif.fixed_ips():
  2288. if not instance.access_ip_v4 and ip['version'] == 4:
  2289. instance.access_ip_v4 = ip['address']
  2290. if not instance.access_ip_v6 and ip['version'] == 6:
  2291. instance.access_ip_v6 = ip['address']
  2292. break
  2293. self._update_instance_after_spawn(context, instance)
  2294. try:
  2295. instance.save(expected_task_state=task_states.SPAWNING)
  2296. except (exception.InstanceNotFound,
  2297. exception.UnexpectedDeletingTaskStateError) as e:
  2298. with excutils.save_and_reraise_exception():
  2299. self._notify_about_instance_usage(context, instance,
  2300. 'create.error', fault=e)
  2301. tb = traceback.format_exc()
  2302. compute_utils.notify_about_instance_create(
  2303. context, instance, self.host,
  2304. phase=fields.NotificationPhase.ERROR, exception=e,
  2305. bdms=block_device_mapping, tb=tb)
  2306. self._update_scheduler_instance_info(context, instance)
  2307. self._notify_about_instance_usage(context, instance, 'create.end',
  2308. extra_usage_info={'message': _('Success')},
  2309. network_info=network_info)
  2310. compute_utils.notify_about_instance_create(context, instance,
  2311. self.host, phase=fields.NotificationPhase.END,
  2312. bdms=block_device_mapping)
  2313. @contextlib.contextmanager
  2314. def _build_resources(self, context, instance, requested_networks,
  2315. security_groups, image_meta, block_device_mapping,
  2316. resource_provider_mapping):
  2317. resources = {}
  2318. network_info = None
  2319. try:
  2320. LOG.debug('Start building networks asynchronously for instance.',
  2321. instance=instance)
  2322. network_info = self._build_networks_for_instance(context, instance,
  2323. requested_networks, security_groups,
  2324. resource_provider_mapping)
  2325. resources['network_info'] = network_info
  2326. except (exception.InstanceNotFound,
  2327. exception.UnexpectedDeletingTaskStateError):
  2328. raise
  2329. except exception.UnexpectedTaskStateError as e:
  2330. raise exception.BuildAbortException(instance_uuid=instance.uuid,
  2331. reason=e.format_message())
  2332. except Exception:
  2333. # Because this allocation is async any failures are likely to occur
  2334. # when the driver accesses network_info during spawn().
  2335. LOG.exception('Failed to allocate network(s)',
  2336. instance=instance)
  2337. msg = _('Failed to allocate the network(s), not rescheduling.')
  2338. raise exception.BuildAbortException(instance_uuid=instance.uuid,
  2339. reason=msg)
  2340. try:
  2341. # Perform any driver preparation work for the driver.
  2342. self.driver.prepare_for_spawn(instance)
  2343. # Depending on a virt driver, some network configuration is
  2344. # necessary before preparing block devices.
  2345. self.driver.prepare_networks_before_block_device_mapping(
  2346. instance, network_info)
  2347. # Verify that all the BDMs have a device_name set and assign a
  2348. # default to the ones missing it with the help of the driver.
  2349. self._default_block_device_names(instance, image_meta,
  2350. block_device_mapping)
  2351. LOG.debug('Start building block device mappings for instance.',
  2352. instance=instance)
  2353. instance.vm_state = vm_states.BUILDING
  2354. instance.task_state = task_states.BLOCK_DEVICE_MAPPING
  2355. instance.save()
  2356. block_device_info = self._prep_block_device(context, instance,
  2357. block_device_mapping)
  2358. resources['block_device_info'] = block_device_info
  2359. except (exception.InstanceNotFound,
  2360. exception.UnexpectedDeletingTaskStateError):
  2361. with excutils.save_and_reraise_exception():
  2362. # Make sure the async call finishes
  2363. if network_info is not None:
  2364. network_info.wait(do_raise=False)
  2365. self.driver.clean_networks_preparation(instance,
  2366. network_info)
  2367. self.driver.failed_spawn_cleanup(instance)
  2368. except (exception.UnexpectedTaskStateError,
  2369. exception.OverQuota, exception.InvalidBDM) as e:
  2370. # Make sure the async call finishes
  2371. if network_info is not None:
  2372. network_info.wait(do_raise=False)
  2373. self.driver.clean_networks_preparation(instance, network_info)
  2374. self.driver.failed_spawn_cleanup(instance)
  2375. raise exception.BuildAbortException(instance_uuid=instance.uuid,
  2376. reason=e.format_message())
  2377. except Exception:
  2378. LOG.exception('Failure prepping block device',
  2379. instance=instance)
  2380. # Make sure the async call finishes
  2381. if network_info is not None:
  2382. network_info.wait(do_raise=False)
  2383. self.driver.clean_networks_preparation(instance, network_info)
  2384. self.driver.failed_spawn_cleanup(instance)
  2385. msg = _('Failure prepping block device.')
  2386. raise exception.BuildAbortException(instance_uuid=instance.uuid,
  2387. reason=msg)
  2388. try:
  2389. yield resources
  2390. except Exception as exc:
  2391. with excutils.save_and_reraise_exception() as ctxt:
  2392. if not isinstance(exc, (
  2393. exception.InstanceNotFound,
  2394. exception.UnexpectedDeletingTaskStateError)):
  2395. LOG.exception('Instance failed to spawn',
  2396. instance=instance)
  2397. # Make sure the async call finishes
  2398. if network_info is not None:
  2399. network_info.wait(do_raise=False)
  2400. # If network_info is empty we're likely here because of a
  2401. # network allocation failure. Since nothing can be reused on
  2402. # reschedule, it's better to deallocate the network to eliminate
  2403. # the chance of orphaned ports in neutron.
  2404. deallocate_networks = not network_info
  2405. try:
  2406. self._shutdown_instance(context, instance,
  2407. block_device_mapping, requested_networks,
  2408. try_deallocate_networks=deallocate_networks)
  2409. except Exception as exc2:
  2410. ctxt.reraise = False
  2411. LOG.warning('Could not clean up failed build,'
  2412. ' not rescheduling. Error: %s',
  2413. six.text_type(exc2))
  2414. raise exception.BuildAbortException(
  2415. instance_uuid=instance.uuid,
  2416. reason=six.text_type(exc))
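# Illustrative sketch (not part of the Nova source): how a generator-based
# context manager like _build_resources() above is typically consumed by its
# caller. The builder yields a dict of prepared resources and runs its cleanup
# branches only if the body of the `with` block raises. All names below are
# hypothetical.
import contextlib

@contextlib.contextmanager
def build_resources_sketch():
    resources = {'network_info': 'fake-network-info',
                 'block_device_info': 'fake-bdm-info'}
    try:
        yield resources
    except Exception:
        # cleanup path: deallocate networks, delete attachments, etc.
        print('cleaning up after failed spawn')
        raise

def spawn_sketch():
    with build_resources_sketch() as resources:
        # the virt driver's spawn() would consume the resources here
        print('spawning with %s' % resources['network_info'])

spawn_sketch()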
  2417. def _cleanup_allocated_networks(self, context, instance,
  2418. requested_networks):
  2419. try:
  2420. self._deallocate_network(context, instance, requested_networks)
  2421. except Exception:
  2422. LOG.exception('Failed to deallocate networks', instance=instance)
  2423. return
  2424. instance.system_metadata['network_allocated'] = 'False'
  2425. try:
  2426. instance.save()
  2427. except exception.InstanceNotFound:
  2428. # NOTE(alaski): It's possible that we're cleaning up the networks
  2429. # because the instance was deleted. If that's the case then this
  2430. # exception will be raised by instance.save()
  2431. pass
  2432. def _try_deallocate_network(self, context, instance,
  2433. requested_networks=None):
  2434. # During auto-scale cleanup, we could be deleting a large number
  2435. # of servers at the same time and overloading parts of the system,
  2436. # so we retry a few times in case of connection failures to the
  2437. # networking service.
  2438. @loopingcall.RetryDecorator(
  2439. max_retry_count=3, inc_sleep_time=2, max_sleep_time=12,
  2440. exceptions=(keystone_exception.connection.ConnectFailure,))
  2441. def _deallocate_network_with_retries():
  2442. try:
  2443. self._deallocate_network(
  2444. context, instance, requested_networks)
  2445. except keystone_exception.connection.ConnectFailure as e:
  2446. # Provide a warning that something is amiss.
  2447. with excutils.save_and_reraise_exception():
  2448. LOG.warning('Failed to deallocate network for instance; '
  2449. 'retrying. Error: %s', six.text_type(e),
  2450. instance=instance)
  2451. try:
  2452. # tear down allocated network structure
  2453. _deallocate_network_with_retries()
  2454. except Exception as ex:
  2455. with excutils.save_and_reraise_exception():
  2456. LOG.error('Failed to deallocate network for instance. '
  2457. 'Error: %s', ex, instance=instance)
  2458. self._set_instance_obj_error_state(context, instance)
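# Illustrative sketch (not the oslo.service implementation): a simplified
# stand-in for loopingcall.RetryDecorator showing the retry-with-increasing-
# sleep behaviour used by _try_deallocate_network() above. All names below
# are hypothetical.
import functools
import time

def retry_sketch(max_retry_count=3, inc_sleep_time=2, max_sleep_time=12,
                 exceptions=()):
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            sleep = 0
            for attempt in range(max_retry_count + 1):
                try:
                    return func(*args, **kwargs)
                except exceptions:
                    if attempt == max_retry_count:
                        raise
                    # back off a little more on each failed attempt
                    sleep = min(sleep + inc_sleep_time, max_sleep_time)
                    time.sleep(sleep)
        return wrapper
    return decorator

@retry_sketch(exceptions=(ConnectionError,))
def flaky_deallocate_sketch():
    print('calling the (hypothetical) networking service')

flaky_deallocate_sketch()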
  2459. def _get_power_off_values(self, context, instance, clean_shutdown):
  2460. """Get the timing configuration for powering down this instance."""
  2461. if clean_shutdown:
  2462. timeout = compute_utils.get_value_from_system_metadata(instance,
  2463. key='image_os_shutdown_timeout', type=int,
  2464. default=CONF.shutdown_timeout)
  2465. retry_interval = CONF.compute.shutdown_retry_interval
  2466. else:
  2467. timeout = 0
  2468. retry_interval = 0
  2469. return timeout, retry_interval
  2470. def _power_off_instance(self, context, instance, clean_shutdown=True):
  2471. """Power off an instance on this host."""
  2472. timeout, retry_interval = self._get_power_off_values(context,
  2473. instance, clean_shutdown)
  2474. self.driver.power_off(instance, timeout, retry_interval)
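# Illustrative sketch: the effect of clean_shutdown on the values returned by
# _get_power_off_values() above. With clean_shutdown=True the timeout comes
# from the image property image_os_shutdown_timeout (falling back to the
# configured shutdown timeout) and the retry interval from configuration;
# with clean_shutdown=False both are zero, i.e. an immediate hard power off.
# The literal numbers below are hypothetical, not Nova's defaults.
def get_power_off_values_sketch(clean_shutdown, image_timeout=None,
                                conf_timeout=60, conf_retry_interval=10):
    if clean_shutdown:
        timeout = image_timeout if image_timeout is not None else conf_timeout
        retry_interval = conf_retry_interval
    else:
        timeout = 0
        retry_interval = 0
    return timeout, retry_interval

print(get_power_off_values_sketch(True, image_timeout=120))   # (120, 10)
print(get_power_off_values_sketch(False))                     # (0, 0)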
  2475. def _shutdown_instance(self, context, instance,
  2476. bdms, requested_networks=None, notify=True,
  2477. try_deallocate_networks=True):
  2478. """Shutdown an instance on this host.
  2479. :param:context: security context
  2480. :param:instance: a nova.objects.Instance object
  2481. :param:bdms: the block devices for the instance to be torn
  2482. down
  2483. :param:requested_networks: the networks on which the instance
  2484. has ports
  2485. :param:notify: true if a final usage notification should be
  2486. emitted
  2487. :param:try_deallocate_networks: false if we should avoid
  2488. trying to teardown networking
  2489. """
  2490. context = context.elevated()
  2491. LOG.info('Terminating instance', instance=instance)
  2492. if notify:
  2493. self._notify_about_instance_usage(context, instance,
  2494. "shutdown.start")
  2495. compute_utils.notify_about_instance_action(context, instance,
  2496. self.host, action=fields.NotificationAction.SHUTDOWN,
  2497. phase=fields.NotificationPhase.START, bdms=bdms)
  2498. network_info = instance.get_network_info()
  2499. # NOTE(arnaudmorin) to avoid nova destroying the instance without
  2500. # unplugging the interface, refresh network_info if it is empty.
  2501. if not network_info:
  2502. network_info = self.network_api.get_instance_nw_info(
  2503. context, instance)
  2504. # NOTE(vish) get bdms before destroying the instance
  2505. vol_bdms = [bdm for bdm in bdms if bdm.is_volume]
  2506. block_device_info = self._get_instance_block_device_info(
  2507. context, instance, bdms=bdms)
  2508. # NOTE(melwitt): attempt driver destroy before releasing ip, may
  2509. # want to keep ip allocated for certain failures
  2510. try:
  2511. LOG.debug('Start destroying the instance on the hypervisor.',
  2512. instance=instance)
  2513. with timeutils.StopWatch() as timer:
  2514. self.driver.destroy(context, instance, network_info,
  2515. block_device_info)
  2516. LOG.info('Took %0.2f seconds to destroy the instance on the '
  2517. 'hypervisor.', timer.elapsed(), instance=instance)
  2518. except exception.InstancePowerOffFailure:
  2519. # if the instance can't power off, don't release the ip
  2520. with excutils.save_and_reraise_exception():
  2521. pass
  2522. except Exception:
  2523. with excutils.save_and_reraise_exception():
  2524. # deallocate ip and fail without proceeding to
  2525. # volume api calls, preserving current behavior
  2526. if try_deallocate_networks:
  2527. self._try_deallocate_network(context, instance,
  2528. requested_networks)
  2529. if try_deallocate_networks:
  2530. self._try_deallocate_network(context, instance, requested_networks)
  2531. timer.restart()
  2532. for bdm in vol_bdms:
  2533. try:
  2534. if bdm.attachment_id:
  2535. self.volume_api.attachment_delete(context,
  2536. bdm.attachment_id)
  2537. else:
  2538. # NOTE(vish): actual driver detach done in driver.destroy,
  2539. # so just tell cinder that we are done with it.
  2540. connector = self.driver.get_volume_connector(instance)
  2541. self.volume_api.terminate_connection(context,
  2542. bdm.volume_id,
  2543. connector)
  2544. self.volume_api.detach(context, bdm.volume_id,
  2545. instance.uuid)
  2546. except exception.VolumeAttachmentNotFound as exc:
  2547. LOG.debug('Ignoring VolumeAttachmentNotFound: %s', exc,
  2548. instance=instance)
  2549. except exception.DiskNotFound as exc:
  2550. LOG.debug('Ignoring DiskNotFound: %s', exc,
  2551. instance=instance)
  2552. except exception.VolumeNotFound as exc:
  2553. LOG.debug('Ignoring VolumeNotFound: %s', exc,
  2554. instance=instance)
  2555. except (cinder_exception.EndpointNotFound,
  2556. keystone_exception.EndpointNotFound) as exc:
  2557. LOG.warning('Ignoring EndpointNotFound for '
  2558. 'volume %(volume_id)s: %(exc)s',
  2559. {'exc': exc, 'volume_id': bdm.volume_id},
  2560. instance=instance)
  2561. except cinder_exception.ClientException as exc:
  2562. LOG.warning('Ignoring unknown cinder exception for '
  2563. 'volume %(volume_id)s: %(exc)s',
  2564. {'exc': exc, 'volume_id': bdm.volume_id},
  2565. instance=instance)
  2566. except Exception as exc:
  2567. LOG.warning('Ignoring unknown exception for '
  2568. 'volume %(volume_id)s: %(exc)s',
  2569. {'exc': exc, 'volume_id': bdm.volume_id},
  2570. instance=instance)
  2571. if vol_bdms:
  2572. LOG.info('Took %(time).2f seconds to detach %(num)s volumes '
  2573. 'for instance.',
  2574. {'time': timer.elapsed(), 'num': len(vol_bdms)},
  2575. instance=instance)
  2576. if notify:
  2577. self._notify_about_instance_usage(context, instance,
  2578. "shutdown.end")
  2579. compute_utils.notify_about_instance_action(context, instance,
  2580. self.host, action=fields.NotificationAction.SHUTDOWN,
  2581. phase=fields.NotificationPhase.END, bdms=bdms)
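# Illustrative sketch: the two Cinder cleanup paths used per volume BDM in
# _shutdown_instance() above. A BDM with an attachment_id uses the newer
# attachment-based flow (attachment_delete); otherwise the legacy flow tears
# down the connection and detaches. The volume API object below is a
# hypothetical stand-in, not python-cinderclient.
class FakeVolumeAPI(object):
    def attachment_delete(self, context, attachment_id):
        print('attachment_delete(%s)' % attachment_id)

    def terminate_connection(self, context, volume_id, connector):
        print('terminate_connection(%s)' % volume_id)

    def detach(self, context, volume_id, instance_uuid):
        print('detach(%s)' % volume_id)

def cleanup_volume_sketch(volume_api, context, bdm, connector, instance_uuid):
    if bdm.get('attachment_id'):
        # new-style (Cinder v3.44+) attachment-based flow
        volume_api.attachment_delete(context, bdm['attachment_id'])
    else:
        # legacy flow: the actual guest detach was done in driver.destroy()
        volume_api.terminate_connection(context, bdm['volume_id'], connector)
        volume_api.detach(context, bdm['volume_id'], instance_uuid)

cleanup_volume_sketch(FakeVolumeAPI(), None,
                      {'volume_id': 'vol-1', 'attachment_id': None},
                      {'host': 'compute1'}, 'instance-uuid-1')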
  2582. def _cleanup_volumes(self, context, instance, bdms, raise_exc=True,
  2583. detach=True):
  2584. exc_info = None
  2585. for bdm in bdms:
  2586. if detach and bdm.volume_id:
  2587. try:
  2588. LOG.debug("Detaching volume: %s", bdm.volume_id,
  2589. instance_uuid=instance.uuid)
  2590. destroy = bdm.delete_on_termination
  2591. self._detach_volume(context, bdm, instance,
  2592. destroy_bdm=destroy)
  2593. except Exception as exc:
  2594. exc_info = sys.exc_info()
  2595. LOG.warning('Failed to detach volume: %(volume_id)s '
  2596. 'due to %(exc)s',
  2597. {'volume_id': bdm.volume_id, 'exc': exc})
  2598. if bdm.volume_id and bdm.delete_on_termination:
  2599. try:
  2600. LOG.debug("Deleting volume: %s", bdm.volume_id,
  2601. instance_uuid=instance.uuid)
  2602. self.volume_api.delete(context, bdm.volume_id)
  2603. except Exception as exc:
  2604. exc_info = sys.exc_info()
  2605. LOG.warning('Failed to delete volume: %(volume_id)s '
  2606. 'due to %(exc)s',
  2607. {'volume_id': bdm.volume_id, 'exc': exc})
  2608. if exc_info is not None and raise_exc:
  2609. six.reraise(exc_info[0], exc_info[1], exc_info[2])
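# Illustrative sketch: the "remember the failure, keep going, re-raise at the
# end" pattern used by _cleanup_volumes() above. Under Python 3 the saved
# exception can simply be re-raised; six.reraise() is only needed to preserve
# the original traceback on Python 2. Names below are hypothetical.
def cleanup_all_sketch(items, raise_exc=True):
    saved_exc = None
    for item in items:
        try:
            if item == 'bad':
                raise RuntimeError('failed to clean %s' % item)
            print('cleaned %s' % item)
        except Exception as exc:
            # keep cleaning the remaining items, remember the failure
            saved_exc = exc
            print('warning: %s' % exc)
    if saved_exc is not None and raise_exc:
        raise saved_exc

cleanup_all_sketch(['a', 'bad', 'b'], raise_exc=False)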
  2610. @hooks.add_hook("delete_instance")
  2611. def _delete_instance(self, context, instance, bdms):
  2612. """Delete an instance on this host.
  2613. :param context: nova request context
  2614. :param instance: nova.objects.instance.Instance object
  2615. :param bdms: nova.objects.block_device.BlockDeviceMappingList object
  2616. """
  2617. events = self.instance_events.clear_events_for_instance(instance)
  2618. if events:
  2619. LOG.debug('Events pending at deletion: %(events)s',
  2620. {'events': ','.join(events.keys())},
  2621. instance=instance)
  2622. self._notify_about_instance_usage(context, instance,
  2623. "delete.start")
  2624. compute_utils.notify_about_instance_action(context, instance,
  2625. self.host, action=fields.NotificationAction.DELETE,
  2626. phase=fields.NotificationPhase.START, bdms=bdms)
  2627. self._shutdown_instance(context, instance, bdms)
  2628. # NOTE(vish): We have already deleted the instance, so we have
  2629. # to ignore problems cleaning up the volumes. It
  2630. # would be nice to let the user know somehow that
  2631. # the volume deletion failed, but it is not
  2632. # acceptable to have an instance that can not be
  2633. # deleted. Perhaps this could be reworked in the
  2634. # future to set an instance fault the first time
  2635. # and to only ignore the failure if the instance
  2636. # is already in ERROR.
  2637. # NOTE(ameeda): The volumes were already detached during the above
  2638. # _shutdown_instance() call, which is why
  2639. # detach is not requested from _cleanup_volumes()
  2640. # in this case.
  2641. self._cleanup_volumes(context, instance, bdms,
  2642. raise_exc=False, detach=False)
  2643. # if a delete task succeeded, always update vm state and task
  2644. # state without expecting task state to be DELETING
  2645. instance.vm_state = vm_states.DELETED
  2646. instance.task_state = None
  2647. instance.power_state = power_state.NOSTATE
  2648. instance.terminated_at = timeutils.utcnow()
  2649. instance.save()
  2650. self._complete_deletion(context, instance)
  2651. # Only destroy the instance in the DB if _complete_deletion
  2652. # doesn't raise, i.e. the allocation was successfully
  2653. # deleted in placement.
  2654. instance.destroy()
  2655. self._notify_about_instance_usage(context, instance, "delete.end")
  2656. compute_utils.notify_about_instance_action(context, instance,
  2657. self.host, action=fields.NotificationAction.DELETE,
  2658. phase=fields.NotificationPhase.END, bdms=bdms)
  2659. @wrap_exception()
  2660. @reverts_task_state
  2661. @wrap_instance_event(prefix='compute')
  2662. @wrap_instance_fault
  2663. def terminate_instance(self, context, instance, bdms):
  2664. """Terminate an instance on this host."""
  2665. @utils.synchronized(instance.uuid)
  2666. def do_terminate_instance(instance, bdms):
  2667. # NOTE(mriedem): If we are deleting the instance while it was
  2668. # booting from volume, we could be racing with a database update of
  2669. # the BDM volume_id. Since the compute API passes the BDMs over RPC
  2670. # to compute here, the BDMs may be stale at this point. So check
  2671. # for any volume BDMs that don't have volume_id set and if we
  2672. # detect that, we need to refresh the BDM list before proceeding.
  2673. # TODO(mriedem): Move this into _delete_instance and make the bdms
  2674. # parameter optional.
  2675. for bdm in list(bdms):
  2676. if bdm.is_volume and not bdm.volume_id:
  2677. LOG.debug('There are potentially stale BDMs during '
  2678. 'delete, refreshing the BlockDeviceMappingList.',
  2679. instance=instance)
  2680. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  2681. context, instance.uuid)
  2682. break
  2683. try:
  2684. self._delete_instance(context, instance, bdms)
  2685. except exception.InstanceNotFound:
  2686. LOG.info("Instance disappeared during terminate",
  2687. instance=instance)
  2688. except Exception:
  2689. # As we're trying to delete always go to Error if something
  2690. # goes wrong that _delete_instance can't handle.
  2691. with excutils.save_and_reraise_exception():
  2692. LOG.exception('Setting instance vm_state to ERROR',
  2693. instance=instance)
  2694. self._set_instance_obj_error_state(context, instance)
  2695. do_terminate_instance(instance, bdms)
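# Illustrative sketch (not nova.utils): a simplified per-key lock decorator in
# the spirit of utils.synchronized(instance.uuid) used by terminate_instance()
# above; it serialises operations such as terminate/stop on the same instance
# while letting different instances proceed in parallel. Hypothetical names.
import collections
import threading

_locks_sketch = collections.defaultdict(threading.Lock)

def synchronized_sketch(key):
    def decorator(func):
        def wrapper(*args, **kwargs):
            with _locks_sketch[key]:
                return func(*args, **kwargs)
        return wrapper
    return decorator

@synchronized_sketch('instance-uuid-1')
def do_terminate_sketch():
    print('terminating while holding the per-instance lock')

do_terminate_sketch()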
  2696. # NOTE(johannes): This is probably better named power_off_instance
  2697. # so it matches the driver method, but because of other issues, we
  2698. # can't use that name in grizzly.
  2699. @wrap_exception()
  2700. @reverts_task_state
  2701. @wrap_instance_event(prefix='compute')
  2702. @wrap_instance_fault
  2703. def stop_instance(self, context, instance, clean_shutdown):
  2704. """Stopping an instance on this host."""
  2705. @utils.synchronized(instance.uuid)
  2706. def do_stop_instance():
  2707. current_power_state = self._get_power_state(context, instance)
  2708. LOG.debug('Stopping instance; current vm_state: %(vm_state)s, '
  2709. 'current task_state: %(task_state)s, current DB '
  2710. 'power_state: %(db_power_state)s, current VM '
  2711. 'power_state: %(current_power_state)s',
  2712. {'vm_state': instance.vm_state,
  2713. 'task_state': instance.task_state,
  2714. 'db_power_state': instance.power_state,
  2715. 'current_power_state': current_power_state},
  2716. instance_uuid=instance.uuid)
  2717. # NOTE(mriedem): If the instance is already powered off, we are
  2718. # possibly tearing down and racing with other operations, so we can
  2719. # expect the task_state to be None if something else updates the
  2720. # instance and we're not locking it.
  2721. expected_task_state = [task_states.POWERING_OFF]
  2722. # The list of power states is from _sync_instance_power_state.
  2723. if current_power_state in (power_state.NOSTATE,
  2724. power_state.SHUTDOWN,
  2725. power_state.CRASHED):
  2726. LOG.info('Instance is already powered off in the '
  2727. 'hypervisor when stop is called.',
  2728. instance=instance)
  2729. expected_task_state.append(None)
  2730. self._notify_about_instance_usage(context, instance,
  2731. "power_off.start")
  2732. compute_utils.notify_about_instance_action(context, instance,
  2733. self.host, action=fields.NotificationAction.POWER_OFF,
  2734. phase=fields.NotificationPhase.START)
  2735. self._power_off_instance(context, instance, clean_shutdown)
  2736. instance.power_state = self._get_power_state(context, instance)
  2737. instance.vm_state = vm_states.STOPPED
  2738. instance.task_state = None
  2739. instance.save(expected_task_state=expected_task_state)
  2740. self._notify_about_instance_usage(context, instance,
  2741. "power_off.end")
  2742. compute_utils.notify_about_instance_action(context, instance,
  2743. self.host, action=fields.NotificationAction.POWER_OFF,
  2744. phase=fields.NotificationPhase.END)
  2745. do_stop_instance()
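# Illustrative sketch: the expected_task_state guard used by stop_instance()
# above. instance.save(expected_task_state=...) only succeeds if the value
# currently stored for the instance matches one of the expected states, which
# protects against racing operations on the same instance. This is a
# simplified stand-in with hypothetical names; the real check happens in
# Nova's database layer.
class FakeInstance(object):
    def __init__(self, task_state):
        self._db_task_state = task_state   # what is currently stored
        self.task_state = task_state       # what we want to store next

    def save(self, expected_task_state=None):
        if (expected_task_state is not None and
                self._db_task_state not in expected_task_state):
            raise RuntimeError('unexpected task_state %r'
                               % self._db_task_state)
        self._db_task_state = self.task_state
        print('saved; task_state is now %r' % self._db_task_state)

inst = FakeInstance('powering-off')
inst.task_state = None                     # stopping clears the task state
# None is also accepted when the guest was already powered off
inst.save(expected_task_state=['powering-off', None])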
  2746. def _power_on(self, context, instance):
  2747. network_info = self.network_api.get_instance_nw_info(context, instance)
  2748. block_device_info = self._get_instance_block_device_info(context,
  2749. instance)
  2750. self.driver.power_on(context, instance,
  2751. network_info,
  2752. block_device_info)
  2753. def _delete_snapshot_of_shelved_instance(self, context, instance,
  2754. snapshot_id):
  2755. """Delete snapshot of shelved instance."""
  2756. try:
  2757. self.image_api.delete(context, snapshot_id)
  2758. except (exception.ImageNotFound,
  2759. exception.ImageNotAuthorized) as exc:
  2760. LOG.warning("Failed to delete snapshot "
  2761. "from shelved instance (%s).",
  2762. exc.format_message(), instance=instance)
  2763. except Exception:
  2764. LOG.exception("Something wrong happened when trying to "
  2765. "delete snapshot from shelved instance.",
  2766. instance=instance)
  2767. # NOTE(johannes): This is probably better named power_on_instance
  2768. # so it matches the driver method, but because of other issues, we
  2769. # can't use that name in grizzly.
  2770. @wrap_exception()
  2771. @reverts_task_state
  2772. @wrap_instance_event(prefix='compute')
  2773. @wrap_instance_fault
  2774. def start_instance(self, context, instance):
  2775. """Starting an instance on this host."""
  2776. self._notify_about_instance_usage(context, instance, "power_on.start")
  2777. compute_utils.notify_about_instance_action(context, instance,
  2778. self.host, action=fields.NotificationAction.POWER_ON,
  2779. phase=fields.NotificationPhase.START)
  2780. self._power_on(context, instance)
  2781. instance.power_state = self._get_power_state(context, instance)
  2782. instance.vm_state = vm_states.ACTIVE
  2783. instance.task_state = None
  2784. # Delete an image(VM snapshot) for a shelved instance
  2785. snapshot_id = instance.system_metadata.get('shelved_image_id')
  2786. if snapshot_id:
  2787. self._delete_snapshot_of_shelved_instance(context, instance,
  2788. snapshot_id)
  2789. # Delete system_metadata for a shelved instance
  2790. compute_utils.remove_shelved_keys_from_system_metadata(instance)
  2791. instance.save(expected_task_state=task_states.POWERING_ON)
  2792. self._notify_about_instance_usage(context, instance, "power_on.end")
  2793. compute_utils.notify_about_instance_action(context, instance,
  2794. self.host, action=fields.NotificationAction.POWER_ON,
  2795. phase=fields.NotificationPhase.END)
  2796. @messaging.expected_exceptions(NotImplementedError,
  2797. exception.TriggerCrashDumpNotSupported,
  2798. exception.InstanceNotRunning)
  2799. @wrap_exception()
  2800. @wrap_instance_event(prefix='compute')
  2801. @wrap_instance_fault
  2802. def trigger_crash_dump(self, context, instance):
  2803. """Trigger crash dump in an instance."""
  2804. self._notify_about_instance_usage(context, instance,
  2805. "trigger_crash_dump.start")
  2806. compute_utils.notify_about_instance_action(context, instance,
  2807. self.host, action=fields.NotificationAction.TRIGGER_CRASH_DUMP,
  2808. phase=fields.NotificationPhase.START)
  2809. # This method does not change task_state and power_state because the
  2810. # effect of a trigger depends on user's configuration.
  2811. self.driver.trigger_crash_dump(instance)
  2812. self._notify_about_instance_usage(context, instance,
  2813. "trigger_crash_dump.end")
  2814. compute_utils.notify_about_instance_action(context, instance,
  2815. self.host, action=fields.NotificationAction.TRIGGER_CRASH_DUMP,
  2816. phase=fields.NotificationPhase.END)
  2817. @wrap_exception()
  2818. @reverts_task_state
  2819. @wrap_instance_event(prefix='compute')
  2820. @wrap_instance_fault
  2821. def soft_delete_instance(self, context, instance):
  2822. """Soft delete an instance on this host."""
  2823. with compute_utils.notify_about_instance_delete(
  2824. self.notifier, context, instance, 'soft_delete',
  2825. source=fields.NotificationSource.COMPUTE):
  2826. try:
  2827. self.driver.soft_delete(instance)
  2828. except NotImplementedError:
  2829. # Fallback to just powering off the instance if the
  2830. # hypervisor doesn't implement the soft_delete method
  2831. self.driver.power_off(instance)
  2832. instance.power_state = self._get_power_state(context, instance)
  2833. instance.vm_state = vm_states.SOFT_DELETED
  2834. instance.task_state = None
  2835. instance.save(expected_task_state=[task_states.SOFT_DELETING])
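# Illustrative sketch: the driver-capability fallback used by
# soft_delete_instance() above (and by restore_instance() below) - if the
# virt driver does not implement the optional operation, the manager falls
# back to a coarser one. The driver object here is a hypothetical stand-in,
# not one of Nova's virt drivers.
class MinimalDriverSketch(object):
    def soft_delete(self, instance):
        raise NotImplementedError()

    def power_off(self, instance):
        print('powered off %s instead' % instance)

def soft_delete_sketch(driver, instance):
    try:
        driver.soft_delete(instance)
    except NotImplementedError:
        # fall back to a plain power off
        driver.power_off(instance)

soft_delete_sketch(MinimalDriverSketch(), 'instance-uuid-1')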
  2836. @wrap_exception()
  2837. @reverts_task_state
  2838. @wrap_instance_event(prefix='compute')
  2839. @wrap_instance_fault
  2840. def restore_instance(self, context, instance):
  2841. """Restore a soft-deleted instance on this host."""
  2842. self._notify_about_instance_usage(context, instance, "restore.start")
  2843. compute_utils.notify_about_instance_action(context, instance,
  2844. self.host, action=fields.NotificationAction.RESTORE,
  2845. phase=fields.NotificationPhase.START)
  2846. try:
  2847. self.driver.restore(instance)
  2848. except NotImplementedError:
  2849. # Fallback to just powering on the instance if the hypervisor
  2850. # doesn't implement the restore method
  2851. self._power_on(context, instance)
  2852. instance.power_state = self._get_power_state(context, instance)
  2853. instance.vm_state = vm_states.ACTIVE
  2854. instance.task_state = None
  2855. instance.save(expected_task_state=task_states.RESTORING)
  2856. self._notify_about_instance_usage(context, instance, "restore.end")
  2857. compute_utils.notify_about_instance_action(context, instance,
  2858. self.host, action=fields.NotificationAction.RESTORE,
  2859. phase=fields.NotificationPhase.END)
  2860. @staticmethod
  2861. def _set_migration_status(migration, status):
  2862. """Set the status, and guard against a None being passed in.
  2863. This is useful as some of the compute RPC calls will not pass
  2864. a migration object in older versions. The check can be removed when
  2865. we move past 4.x major version of the RPC API.
  2866. """
  2867. if migration:
  2868. migration.status = status
  2869. migration.save()
  2870. def _rebuild_default_impl(self, context, instance, image_meta,
  2871. injected_files, admin_password, allocations,
  2872. bdms, detach_block_devices, attach_block_devices,
  2873. network_info=None,
  2874. evacuate=False, block_device_info=None,
  2875. preserve_ephemeral=False):
  2876. if preserve_ephemeral:
  2877. # The default code path does not support preserving ephemeral
  2878. # partitions.
  2879. raise exception.PreserveEphemeralNotSupported()
  2880. if evacuate:
  2881. detach_block_devices(context, bdms)
  2882. else:
  2883. self._power_off_instance(context, instance, clean_shutdown=True)
  2884. detach_block_devices(context, bdms)
  2885. self.driver.destroy(context, instance,
  2886. network_info=network_info,
  2887. block_device_info=block_device_info)
  2888. instance.task_state = task_states.REBUILD_BLOCK_DEVICE_MAPPING
  2889. instance.save(expected_task_state=[task_states.REBUILDING])
  2890. new_block_device_info = attach_block_devices(context, instance, bdms)
  2891. instance.task_state = task_states.REBUILD_SPAWNING
  2892. instance.save(
  2893. expected_task_state=[task_states.REBUILD_BLOCK_DEVICE_MAPPING])
  2894. with instance.mutated_migration_context():
  2895. self.driver.spawn(context, instance, image_meta, injected_files,
  2896. admin_password, allocations,
  2897. network_info=network_info,
  2898. block_device_info=new_block_device_info)
  2899. def _notify_instance_rebuild_error(self, context, instance, error, bdms):
  2900. tb = traceback.format_exc()
  2901. self._notify_about_instance_usage(context, instance,
  2902. 'rebuild.error', fault=error)
  2903. compute_utils.notify_about_instance_rebuild(
  2904. context, instance, self.host,
  2905. phase=fields.NotificationPhase.ERROR, exception=error, bdms=bdms,
  2906. tb=tb)
  2907. @messaging.expected_exceptions(exception.PreserveEphemeralNotSupported)
  2908. @wrap_exception()
  2909. @reverts_task_state
  2910. @wrap_instance_event(prefix='compute')
  2911. @wrap_instance_fault
  2912. def rebuild_instance(self, context, instance, orig_image_ref, image_ref,
  2913. injected_files, new_pass, orig_sys_metadata,
  2914. bdms, recreate, on_shared_storage,
  2915. preserve_ephemeral, migration,
  2916. scheduled_node, limits, request_spec):
  2917. """Destroy and re-make this instance.
  2918. A 'rebuild' effectively purges all existing data from the system and
  2919. remakes the VM with given 'metadata' and 'personalities'.
  2920. :param context: `nova.RequestContext` object
  2921. :param instance: Instance object
  2922. :param orig_image_ref: Original image_ref before rebuild
  2923. :param image_ref: New image_ref for rebuild
  2924. :param injected_files: Files to inject
  2925. :param new_pass: password to set on rebuilt instance
  2926. :param orig_sys_metadata: instance system metadata from pre-rebuild
  2927. :param bdms: block-device-mappings to use for rebuild
  2928. :param recreate: True if the instance is being evacuated (e.g. the
  2929. hypervisor it was on failed) - cleanup of old state will be
  2930. skipped.
  2931. :param on_shared_storage: True if instance files on shared storage.
  2932. If not provided then information from the
  2933. driver will be used to decide if the instance
  2934. files are available or not on the target host
  2935. :param preserve_ephemeral: True if the default ephemeral storage
  2936. partition must be preserved on rebuild
  2937. :param migration: a Migration object if one was created for this
  2938. rebuild operation (if it's a part of evacuate)
  2939. :param scheduled_node: A node of the host chosen by the scheduler. If a
  2940. host was specified by the user, this will be
  2941. None
  2942. :param limits: Overcommit limits set by the scheduler. If a host was
  2943. specified by the user, this will be None
  2944. :param request_spec: a RequestSpec object used to schedule the instance
  2945. """
  2946. # recreate=True means the instance is being evacuated from a failed
  2947. # host to a new destination host (this host). The 'recreate' variable
  2948. # name is confusing, so rename it to evacuate here at the top, which
  2949. # is simpler than renaming a parameter in an RPC versioned method.
  2950. evacuate = recreate
  2951. context = context.elevated()
  2952. if evacuate:
  2953. LOG.info("Evacuating instance", instance=instance)
  2954. else:
  2955. LOG.info("Rebuilding instance", instance=instance)
  2956. if evacuate:
  2957. # This is an evacuation to a new host, so we need to perform a
  2958. # resource claim.
  2959. rebuild_claim = self.rt.rebuild_claim
  2960. else:
  2961. # This is a rebuild to the same host, so we don't need to make
  2962. # a claim since the instance is already on this host.
  2963. rebuild_claim = claims.NopClaim
  2964. if image_ref:
  2965. image_meta = objects.ImageMeta.from_image_ref(
  2966. context, self.image_api, image_ref)
  2967. elif evacuate:
  2968. # For evacuate the API does not send down the image_ref since the
  2969. # image does not change so just get it from what was stashed in
  2970. # the instance system_metadata when the instance was created (or
  2971. # last rebuilt). This also works for volume-backed instances.
  2972. image_meta = instance.image_meta
  2973. else:
  2974. image_meta = objects.ImageMeta()
  2975. # NOTE(mriedem): On an evacuate, we need to update
  2976. # the instance's host and node properties to reflect its
  2977. # destination node for the evacuate.
  2978. if not scheduled_node:
  2979. if evacuate:
  2980. try:
  2981. compute_node = self._get_compute_info(context, self.host)
  2982. scheduled_node = compute_node.hypervisor_hostname
  2983. except exception.ComputeHostNotFound:
  2984. LOG.exception('Failed to get compute_info for %s',
  2985. self.host)
  2986. else:
  2987. scheduled_node = instance.node
  2988. allocs = self.reportclient.get_allocations_for_consumer(
  2989. context, instance.uuid)
  2990. # If the resource claim or group policy validation fails before we
  2991. # do anything to the guest or its networking/volumes we want to keep
  2992. # the current status rather than put the instance into ERROR status.
  2993. instance_state = instance.vm_state
  2994. with self._error_out_instance_on_exception(
  2995. context, instance, instance_state=instance_state):
  2996. try:
  2997. self._do_rebuild_instance_with_claim(
  2998. context, instance, orig_image_ref,
  2999. image_meta, injected_files, new_pass, orig_sys_metadata,
  3000. bdms, evacuate, on_shared_storage, preserve_ephemeral,
  3001. migration, request_spec, allocs, rebuild_claim,
  3002. scheduled_node, limits)
  3003. except (exception.ComputeResourcesUnavailable,
  3004. exception.RescheduledException) as e:
  3005. if isinstance(e, exception.ComputeResourcesUnavailable):
  3006. LOG.debug("Could not rebuild instance on this host, not "
  3007. "enough resources available.", instance=instance)
  3008. else:
  3009. # RescheduledException is raised by the late server group
  3010. # policy check during evacuation if a parallel scheduling
  3011. # violated the policy.
  3012. # We catch the RescheduledException here but we don't have
  3013. # the plumbing to do an actual reschedule so we abort the
  3014. # operation.
  3015. LOG.debug("Could not rebuild instance on this host, "
  3016. "late server group check failed.",
  3017. instance=instance)
  3018. # NOTE(ndipanov): We just abort the build for now and leave a
  3019. # migration record for potential cleanup later
  3020. self._set_migration_status(migration, 'failed')
  3021. # Since the claim failed, we need to remove the allocation
  3022. # created against the destination node. Note that we can only
  3023. # get here when evacuating to a destination node. Rebuilding
  3024. # on the same host (not evacuate) uses the NopClaim which will
  3025. # not raise ComputeResourcesUnavailable.
  3026. self.rt.delete_allocation_for_evacuated_instance(
  3027. context, instance, scheduled_node, node_type='destination')
  3028. self._notify_instance_rebuild_error(context, instance, e, bdms)
  3029. # Wrap this in InstanceFaultRollback so that the
  3030. # _error_out_instance_on_exception context manager keeps the
  3031. # vm_state unchanged.
  3032. raise exception.InstanceFaultRollback(
  3033. inner_exception=exception.BuildAbortException(
  3034. instance_uuid=instance.uuid,
  3035. reason=e.format_message()))
  3036. except (exception.InstanceNotFound,
  3037. exception.UnexpectedDeletingTaskStateError) as e:
  3038. LOG.debug('Instance was deleted while rebuilding',
  3039. instance=instance)
  3040. self._set_migration_status(migration, 'failed')
  3041. self._notify_instance_rebuild_error(context, instance, e, bdms)
  3042. except Exception as e:
  3043. self._set_migration_status(migration, 'failed')
  3044. if evacuate or scheduled_node is not None:
  3045. self.rt.delete_allocation_for_evacuated_instance(
  3046. context, instance, scheduled_node,
  3047. node_type='destination')
  3048. self._notify_instance_rebuild_error(context, instance, e, bdms)
  3049. raise
  3050. else:
  3051. instance.apply_migration_context()
  3052. # NOTE (ndipanov): This save will now update the host and node
  3053. # attributes making sure that next RT pass is consistent since
  3054. # it will be based on the instance and not the migration DB
  3055. # entry.
  3056. instance.host = self.host
  3057. instance.node = scheduled_node
  3058. instance.save()
  3059. instance.drop_migration_context()
  3060. # NOTE (ndipanov): Mark the migration as done only after we
  3061. # mark the instance as belonging to this host.
  3062. self._set_migration_status(migration, 'done')
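# Illustrative sketch: how rebuild_instance() above picks a resource claim.
# Evacuating to a new host requires a real claim against this host's resource
# tracker, while rebuilding in place uses a no-op claim (claims.NopClaim in
# the code above) because the instance already consumes resources here. The
# context managers below are hypothetical stand-ins.
import contextlib

@contextlib.contextmanager
def nop_claim_sketch():
    yield  # nothing to reserve, nothing to roll back

@contextlib.contextmanager
def rebuild_claim_sketch():
    print('claiming resources on the destination host')
    try:
        yield
    except Exception:
        print('aborting the claim after a failure')
        raise

def pick_claim_sketch(evacuate):
    return rebuild_claim_sketch if evacuate else nop_claim_sketch

with pick_claim_sketch(evacuate=True)():
    print('rebuilding the guest')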
  3063. def _do_rebuild_instance_with_claim(
  3064. self, context, instance, orig_image_ref, image_meta,
  3065. injected_files, new_pass, orig_sys_metadata, bdms, evacuate,
  3066. on_shared_storage, preserve_ephemeral, migration, request_spec,
  3067. allocations, rebuild_claim, scheduled_node, limits):
  3068. """Helper to avoid deep nesting in the top-level method."""
  3069. request_group_resource_providers_mapping = None
  3070. if evacuate:
  3071. request_group_resource_providers_mapping = \
  3072. self._get_request_group_mapping(request_spec)
  3073. if request_group_resource_providers_mapping:
  3074. self._update_pci_request_spec_with_allocated_interface_name(
  3075. context, instance,
  3076. request_group_resource_providers_mapping)
  3077. claim_context = rebuild_claim(
  3078. context, instance, scheduled_node, allocations,
  3079. limits=limits, image_meta=image_meta, migration=migration)
  3080. with claim_context:
  3081. self._do_rebuild_instance(
  3082. context, instance, orig_image_ref, image_meta, injected_files,
  3083. new_pass, orig_sys_metadata, bdms, evacuate, on_shared_storage,
  3084. preserve_ephemeral, migration, request_spec, allocations,
  3085. request_group_resource_providers_mapping)
  3086. @staticmethod
  3087. def _get_image_name(image_meta):
  3088. if image_meta.obj_attr_is_set("name"):
  3089. return image_meta.name
  3090. else:
  3091. return ''
  3092. def _do_rebuild_instance(self, context, instance, orig_image_ref,
  3093. image_meta, injected_files, new_pass,
  3094. orig_sys_metadata, bdms, evacuate,
  3095. on_shared_storage, preserve_ephemeral,
  3096. migration, request_spec, allocations,
  3097. request_group_resource_providers_mapping):
  3098. orig_vm_state = instance.vm_state
  3099. if evacuate:
  3100. if request_spec:
  3101. # NOTE(gibi): Do a late check of server group policy as
  3102. # parallel scheduling could violate such policy. This will
  3103. # cause the evacuate to fail as rebuild does not implement
  3104. # reschedule.
  3105. hints = self._get_scheduler_hints({}, request_spec)
  3106. self._validate_instance_group_policy(context, instance, hints)
  3107. if not self.driver.capabilities.get("supports_evacuate", False):
  3108. raise exception.InstanceEvacuateNotSupported
  3109. self._check_instance_exists(context, instance)
  3110. if on_shared_storage is None:
  3111. LOG.debug('on_shared_storage is not provided, using driver '
  3112. 'information to decide if the instance needs to '
  3113. 'be evacuated')
  3114. on_shared_storage = self.driver.instance_on_disk(instance)
  3115. elif (on_shared_storage !=
  3116. self.driver.instance_on_disk(instance)):
  3117. # To cover the case when the admin expects that instance files
  3118. # are on shared storage but they are not accessible, and vice versa.
  3119. raise exception.InvalidSharedStorage(
  3120. _("Invalid state of instance files on shared"
  3121. " storage"))
  3122. if on_shared_storage:
  3123. LOG.info('disk on shared storage, evacuating using'
  3124. ' existing disk')
  3125. elif instance.image_ref:
  3126. orig_image_ref = instance.image_ref
  3127. LOG.info("disk not on shared storage, evacuating from "
  3128. "image: '%s'", str(orig_image_ref))
  3129. else:
  3130. LOG.info('disk on volume, evacuating using existing '
  3131. 'volume')
  3132. # We check trusted certs capabilities for both evacuate (rebuild on
  3133. # another host) and rebuild (rebuild on the same host) because for
  3134. # evacuate we need to make sure an instance with trusted certs can
  3135. # have the image verified with those certs during rebuild, and for
  3136. # rebuild we could be rebuilding a server that started out with no
  3137. # trusted certs on this host, and then was rebuilt with trusted certs
  3138. # for a new image, in which case we need to validate that new image
  3139. # with the trusted certs during the rebuild.
  3140. self._check_trusted_certs(instance)
  3141. # This instance.exists message should contain the original
  3142. # image_ref, not the new one. Since the DB has been updated
  3143. # to point to the new one... we have to override it.
  3144. orig_image_ref_url = self.image_api.generate_image_url(orig_image_ref,
  3145. context)
  3146. extra_usage_info = {'image_ref_url': orig_image_ref_url}
  3147. compute_utils.notify_usage_exists(
  3148. self.notifier, context, instance, self.host,
  3149. current_period=True, system_metadata=orig_sys_metadata,
  3150. extra_usage_info=extra_usage_info)
  3151. # This message should contain the new image_ref
  3152. extra_usage_info = {'image_name': self._get_image_name(image_meta)}
  3153. self._notify_about_instance_usage(context, instance,
  3154. "rebuild.start", extra_usage_info=extra_usage_info)
  3155. # NOTE: image_name is not included in the versioned notification
  3156. # because we already provide the image_uuid in the notification
  3157. # payload and the image details can be looked up via the uuid.
  3158. compute_utils.notify_about_instance_rebuild(
  3159. context, instance, self.host,
  3160. phase=fields.NotificationPhase.START,
  3161. bdms=bdms)
  3162. instance.power_state = self._get_power_state(context, instance)
  3163. instance.task_state = task_states.REBUILDING
  3164. instance.save(expected_task_state=[task_states.REBUILDING])
  3165. if evacuate:
  3166. self.network_api.setup_networks_on_host(
  3167. context, instance, self.host)
  3168. # For nova-network this is needed to move floating IPs
  3169. # For neutron this updates the host in the port binding
  3170. # TODO(cfriesen): this network_api call and the one above
  3171. # are so similar, we should really try to unify them.
  3172. self.network_api.setup_instance_network_on_host(
  3173. context, instance, self.host, migration,
  3174. provider_mappings=request_group_resource_providers_mapping)
  3175. # TODO(mriedem): Consider decorating setup_instance_network_on_host
  3176. # with @base_api.refresh_cache and then we wouldn't need this
  3177. # explicit call to get_instance_nw_info.
  3178. network_info = self.network_api.get_instance_nw_info(context,
  3179. instance)
  3180. else:
  3181. network_info = instance.get_network_info()
  3182. if bdms is None:
  3183. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  3184. context, instance.uuid)
  3185. block_device_info = \
  3186. self._get_instance_block_device_info(
  3187. context, instance, bdms=bdms)
  3188. def detach_block_devices(context, bdms):
  3189. for bdm in bdms:
  3190. if bdm.is_volume:
  3191. # NOTE (ildikov): Having the attachment_id set in the BDM
  3192. # means that it's the new Cinder attach/detach flow
  3193. # (available from v3.44). In that case we explicitly
  3194. # attach and detach the volumes through attachment level
  3195. # operations. In this scenario _detach_volume will delete
  3196. # the existing attachment which would make the volume
  3197. # status change to 'available' if we don't pre-create
  3198. # another empty attachment before deleting the old one.
  3199. attachment_id = None
  3200. if bdm.attachment_id:
  3201. attachment_id = self.volume_api.attachment_create(
  3202. context, bdm['volume_id'], instance.uuid)['id']
  3203. self._detach_volume(context, bdm, instance,
  3204. destroy_bdm=False)
  3205. if attachment_id:
  3206. bdm.attachment_id = attachment_id
  3207. bdm.save()
  3208. files = self._decode_files(injected_files)
  3209. kwargs = dict(
  3210. context=context,
  3211. instance=instance,
  3212. image_meta=image_meta,
  3213. injected_files=files,
  3214. admin_password=new_pass,
  3215. allocations=allocations,
  3216. bdms=bdms,
  3217. detach_block_devices=detach_block_devices,
  3218. attach_block_devices=self._prep_block_device,
  3219. block_device_info=block_device_info,
  3220. network_info=network_info,
  3221. preserve_ephemeral=preserve_ephemeral,
  3222. evacuate=evacuate)
  3223. try:
  3224. with instance.mutated_migration_context():
  3225. self.driver.rebuild(**kwargs)
  3226. except NotImplementedError:
  3227. # NOTE(rpodolyaka): driver doesn't provide specialized version
  3228. # of rebuild, fall back to the default implementation
  3229. self._rebuild_default_impl(**kwargs)
  3230. self._update_instance_after_spawn(context, instance)
  3231. instance.save(expected_task_state=[task_states.REBUILD_SPAWNING])
  3232. if orig_vm_state == vm_states.STOPPED:
  3233. LOG.info("bringing vm to original state: '%s'",
  3234. orig_vm_state, instance=instance)
  3235. instance.vm_state = vm_states.ACTIVE
  3236. instance.task_state = task_states.POWERING_OFF
  3237. instance.progress = 0
  3238. instance.save()
  3239. self.stop_instance(context, instance, False)
  3240. # TODO(melwitt): We should clean up instance console tokens here in the
  3241. # case of evacuate. The instance is on a new host and will need to
  3242. # establish a new console connection.
  3243. self._update_scheduler_instance_info(context, instance)
  3244. self._notify_about_instance_usage(
  3245. context, instance, "rebuild.end",
  3246. network_info=network_info,
  3247. extra_usage_info=extra_usage_info)
  3248. compute_utils.notify_about_instance_rebuild(
  3249. context, instance, self.host,
  3250. phase=fields.NotificationPhase.END,
  3251. bdms=bdms)
  3252. def _handle_bad_volumes_detached(self, context, instance, bad_devices,
  3253. block_device_info):
  3254. """Handle cases where the virt-layer had to detach non-working volumes
  3255. in order to complete an operation.
  3256. """
  3257. for bdm in block_device_info['block_device_mapping']:
  3258. if bdm.get('mount_device') in bad_devices:
  3259. try:
  3260. volume_id = bdm['connection_info']['data']['volume_id']
  3261. except KeyError:
  3262. continue
  3263. # NOTE(sirp): ideally we'd just call
  3264. # `compute_api.detach_volume` here but since that hits the
  3265. # DB directly, that's off limits from within the
  3266. # compute-manager.
  3267. #
  3268. # API-detach
  3269. LOG.info("Detaching from volume api: %s", volume_id)
  3270. self.volume_api.begin_detaching(context, volume_id)
  3271. # Manager-detach
  3272. self.detach_volume(context, volume_id, instance)
  3273. @wrap_exception()
  3274. @reverts_task_state
  3275. @wrap_instance_event(prefix='compute')
  3276. @wrap_instance_fault
  3277. def reboot_instance(self, context, instance, block_device_info,
  3278. reboot_type):
  3279. """Reboot an instance on this host."""
  3280. # acknowledge the request made it to the manager
  3281. if reboot_type == "SOFT":
  3282. instance.task_state = task_states.REBOOT_PENDING
  3283. expected_states = task_states.soft_reboot_states
  3284. else:
  3285. instance.task_state = task_states.REBOOT_PENDING_HARD
  3286. expected_states = task_states.hard_reboot_states
  3287. context = context.elevated()
  3288. LOG.info("Rebooting instance", instance=instance)
  3289. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  3290. context, instance.uuid)
  3291. block_device_info = self._get_instance_block_device_info(
  3292. context, instance, bdms=bdms)
  3293. network_info = self.network_api.get_instance_nw_info(context, instance)
  3294. self._notify_about_instance_usage(context, instance, "reboot.start")
  3295. compute_utils.notify_about_instance_action(
  3296. context, instance, self.host,
  3297. action=fields.NotificationAction.REBOOT,
  3298. phase=fields.NotificationPhase.START,
  3299. bdms=bdms
  3300. )
  3301. instance.power_state = self._get_power_state(context, instance)
  3302. instance.save(expected_task_state=expected_states)
  3303. if instance.power_state != power_state.RUNNING:
  3304. state = instance.power_state
  3305. running = power_state.RUNNING
  3306. LOG.warning('trying to reboot a non-running instance:'
  3307. ' (state: %(state)s expected: %(running)s)',
  3308. {'state': state, 'running': running},
  3309. instance=instance)
  3310. def bad_volumes_callback(bad_devices):
  3311. self._handle_bad_volumes_detached(
  3312. context, instance, bad_devices, block_device_info)
  3313. try:
  3314. # Don't change it out of rescue mode
  3315. if instance.vm_state == vm_states.RESCUED:
  3316. new_vm_state = vm_states.RESCUED
  3317. else:
  3318. new_vm_state = vm_states.ACTIVE
  3319. new_power_state = None
  3320. if reboot_type == "SOFT":
  3321. instance.task_state = task_states.REBOOT_STARTED
  3322. expected_state = task_states.REBOOT_PENDING
  3323. else:
  3324. instance.task_state = task_states.REBOOT_STARTED_HARD
  3325. expected_state = task_states.REBOOT_PENDING_HARD
  3326. instance.save(expected_task_state=expected_state)
  3327. self.driver.reboot(context, instance,
  3328. network_info,
  3329. reboot_type,
  3330. block_device_info=block_device_info,
  3331. bad_volumes_callback=bad_volumes_callback)
  3332. except Exception as error:
  3333. with excutils.save_and_reraise_exception() as ctxt:
  3334. exc_info = sys.exc_info()
  3335. # if the reboot failed but the VM is running don't
  3336. # put it into an error state
  3337. new_power_state = self._get_power_state(context, instance)
  3338. if new_power_state == power_state.RUNNING:
  3339. LOG.warning('Reboot failed but instance is running',
  3340. instance=instance)
  3341. compute_utils.add_instance_fault_from_exc(context,
  3342. instance, error, exc_info)
  3343. self._notify_about_instance_usage(context, instance,
  3344. 'reboot.error', fault=error)
  3345. tb = traceback.format_exc()
  3346. compute_utils.notify_about_instance_action(
  3347. context, instance, self.host,
  3348. action=fields.NotificationAction.REBOOT,
  3349. phase=fields.NotificationPhase.ERROR,
  3350. exception=error, bdms=bdms, tb=tb
  3351. )
  3352. ctxt.reraise = False
  3353. else:
  3354. LOG.error('Cannot reboot instance: %s', error,
  3355. instance=instance)
  3356. self._set_instance_obj_error_state(context, instance)
  3357. if not new_power_state:
  3358. new_power_state = self._get_power_state(context, instance)
  3359. try:
  3360. instance.power_state = new_power_state
  3361. instance.vm_state = new_vm_state
  3362. instance.task_state = None
  3363. instance.save()
  3364. except exception.InstanceNotFound:
  3365. LOG.warning("Instance disappeared during reboot",
  3366. instance=instance)
  3367. self._notify_about_instance_usage(context, instance, "reboot.end")
  3368. compute_utils.notify_about_instance_action(
  3369. context, instance, self.host,
  3370. action=fields.NotificationAction.REBOOT,
  3371. phase=fields.NotificationPhase.END,
  3372. bdms=bdms
  3373. )
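# Illustrative sketch: a summary of the task_states constants that
# reboot_instance() above moves through for the two reboot types. The mapping
# below uses the constant names referenced in the method, expressed as plain
# strings for illustration only.
REBOOT_STATES_SKETCH = {
    'SOFT': ('REBOOT_PENDING', 'REBOOT_STARTED'),
    'HARD': ('REBOOT_PENDING_HARD', 'REBOOT_STARTED_HARD'),
}

for reboot_type, (pending, started) in sorted(REBOOT_STATES_SKETCH.items()):
    print('%s reboot: %s -> %s' % (reboot_type, pending, started))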
  3374. @delete_image_on_error
  3375. def _do_snapshot_instance(self, context, image_id, instance):
  3376. self._snapshot_instance(context, image_id, instance,
  3377. task_states.IMAGE_BACKUP)
  3378. @wrap_exception()
  3379. @reverts_task_state
  3380. @wrap_instance_event(prefix='compute')
  3381. @wrap_instance_fault
  3382. def backup_instance(self, context, image_id, instance, backup_type,
  3383. rotation):
  3384. """Backup an instance on this host.
  3385. :param backup_type: daily | weekly
  3386. :param rotation: int representing how many backups to keep around
  3387. """
  3388. self._do_snapshot_instance(context, image_id, instance)
  3389. self._rotate_backups(context, instance, backup_type, rotation)
  3390. @wrap_exception()
  3391. @reverts_task_state
  3392. @wrap_instance_event(prefix='compute')
  3393. @wrap_instance_fault
  3394. @delete_image_on_error
  3395. def snapshot_instance(self, context, image_id, instance):
  3396. """Snapshot an instance on this host.
  3397. :param context: security context
  3398. :param image_id: glance.db.sqlalchemy.models.Image.Id
  3399. :param instance: a nova.objects.instance.Instance object
  3400. """
  3401. # NOTE(dave-mcnally) The task state will already be set by the API,
  3402. # but if the compute manager has crashed or been restarted before the
  3403. # request got here, the task state may have been cleared, so we set
  3404. # it again and things continue normally.
  3405. try:
  3406. instance.task_state = task_states.IMAGE_SNAPSHOT
  3407. instance.save(
  3408. expected_task_state=task_states.IMAGE_SNAPSHOT_PENDING)
  3409. except exception.InstanceNotFound:
  3410. # possibility instance no longer exists, no point in continuing
  3411. LOG.debug("Instance not found, could not set state %s "
  3412. "for instance.",
  3413. task_states.IMAGE_SNAPSHOT, instance=instance)
  3414. return
  3415. except exception.UnexpectedDeletingTaskStateError:
  3416. LOG.debug("Instance being deleted, snapshot cannot continue",
  3417. instance=instance)
  3418. return
  3419. self._snapshot_instance(context, image_id, instance,
  3420. task_states.IMAGE_SNAPSHOT)
  3421. def _snapshot_instance(self, context, image_id, instance,
  3422. expected_task_state):
  3423. context = context.elevated()
  3424. instance.power_state = self._get_power_state(context, instance)
  3425. try:
  3426. instance.save()
  3427. LOG.info('instance snapshotting', instance=instance)
  3428. if instance.power_state != power_state.RUNNING:
  3429. state = instance.power_state
  3430. running = power_state.RUNNING
  3431. LOG.warning('trying to snapshot a non-running instance: '
  3432. '(state: %(state)s expected: %(running)s)',
  3433. {'state': state, 'running': running},
  3434. instance=instance)
  3435. self._notify_about_instance_usage(
  3436. context, instance, "snapshot.start")
  3437. compute_utils.notify_about_instance_snapshot(context, instance,
  3438. self.host, phase=fields.NotificationPhase.START,
  3439. snapshot_image_id=image_id)
  3440. def update_task_state(task_state,
  3441. expected_state=expected_task_state):
  3442. instance.task_state = task_state
  3443. instance.save(expected_task_state=expected_state)
  3444. with timeutils.StopWatch() as timer:
  3445. self.driver.snapshot(context, instance, image_id,
  3446. update_task_state)
  3447. LOG.info('Took %0.2f seconds to snapshot the instance on '
  3448. 'the hypervisor.', timer.elapsed(), instance=instance)
  3449. instance.task_state = None
  3450. instance.save(expected_task_state=task_states.IMAGE_UPLOADING)
  3451. self._notify_about_instance_usage(context, instance,
  3452. "snapshot.end")
  3453. compute_utils.notify_about_instance_snapshot(context, instance,
  3454. self.host, phase=fields.NotificationPhase.END,
  3455. snapshot_image_id=image_id)
  3456. except (exception.InstanceNotFound,
  3457. exception.UnexpectedDeletingTaskStateError):
  3458. # the instance got deleted during the snapshot
  3459. # Quickly bail out of here
  3460. msg = 'Instance disappeared during snapshot'
  3461. LOG.debug(msg, instance=instance)
  3462. try:
  3463. image = self.image_api.get(context, image_id)
  3464. if image['status'] != 'active':
  3465. self.image_api.delete(context, image_id)
  3466. except exception.ImageNotFound:
  3467. LOG.debug('Image not found during clean up %s', image_id)
  3468. except Exception:
  3469. LOG.warning("Error while trying to clean up image %s",
  3470. image_id, instance=instance)
  3471. except exception.ImageNotFound:
  3472. instance.task_state = None
  3473. instance.save()
  3474. LOG.warning("Image not found during snapshot", instance=instance)
  3475. def _post_interrupted_snapshot_cleanup(self, context, instance):
  3476. self.driver.post_interrupted_snapshot_cleanup(context, instance)
  3477. @messaging.expected_exceptions(NotImplementedError)
  3478. @wrap_exception()
  3479. def volume_snapshot_create(self, context, instance, volume_id,
  3480. create_info):
  3481. self.driver.volume_snapshot_create(context, instance, volume_id,
  3482. create_info)
  3483. @messaging.expected_exceptions(NotImplementedError)
  3484. @wrap_exception()
  3485. def volume_snapshot_delete(self, context, instance, volume_id,
  3486. snapshot_id, delete_info):
  3487. self.driver.volume_snapshot_delete(context, instance, volume_id,
  3488. snapshot_id, delete_info)
  3489. @wrap_instance_fault
  3490. def _rotate_backups(self, context, instance, backup_type, rotation):
  3491. """Delete excess backups associated to an instance.
  3492. Instances are allowed a fixed number of backups (the rotation number);
  3493. this method deletes the oldest backups that exceed the rotation
  3494. threshold.
  3495. :param context: security context
  3496. :param instance: Instance dict
  3497. :param backup_type: a user-defined type, like "daily" or "weekly" etc.
  3498. :param rotation: int representing how many backups to keep around;
  3499. None if rotation shouldn't be used (as in the case of snapshots)
  3500. """
  3501. filters = {'property-image_type': 'backup',
  3502. 'property-backup_type': backup_type,
  3503. 'property-instance_uuid': instance.uuid}
  3504. images = self.image_api.get_all(context, filters=filters,
  3505. sort_key='created_at', sort_dir='desc')
  3506. num_images = len(images)
  3507. LOG.debug("Found %(num_images)d images (rotation: %(rotation)d)",
  3508. {'num_images': num_images, 'rotation': rotation},
  3509. instance=instance)
  3510. if num_images > rotation:
  3511. # NOTE(sirp): this deletes all backups that exceed the rotation
  3512. # limit
  3513. excess = len(images) - rotation
  3514. LOG.debug("Rotating out %d backups", excess,
  3515. instance=instance)
  3516. for i in range(excess):
  3517. image = images.pop()
  3518. image_id = image['id']
  3519. LOG.debug("Deleting image %s", image_id,
  3520. instance=instance)
  3521. try:
  3522. self.image_api.delete(context, image_id)
  3523. except exception.ImageNotFound:
  3524. LOG.info("Failed to find image %(image_id)s to "
  3525. "delete", {'image_id': image_id},
  3526. instance=instance)
  3527. except (exception.ImageDeleteConflict, Exception) as exc:
  3528. LOG.info("Failed to delete image %(image_id)s during "
  3529. "deleting excess backups. "
  3530. "Continuing for next image.. %(exc)s",
  3531. {'image_id': image_id, 'exc': exc},
  3532. instance=instance)
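# Illustrative sketch: the rotation arithmetic in _rotate_backups() above.
# Images are sorted newest-first, so popping from the end of the list removes
# the oldest backups until only `rotation` of them remain. Hypothetical data.
def rotate_sketch(image_ids_newest_first, rotation):
    images = list(image_ids_newest_first)
    deleted = []
    if len(images) > rotation:
        excess = len(images) - rotation
        for _ in range(excess):
            deleted.append(images.pop())   # oldest image is last in the list
    return images, deleted

kept, deleted = rotate_sketch(['img-5', 'img-4', 'img-3', 'img-2', 'img-1'],
                              rotation=3)
print(kept)      # ['img-5', 'img-4', 'img-3']
print(deleted)   # ['img-1', 'img-2']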
  3533. @wrap_exception()
  3534. @reverts_task_state
  3535. @wrap_instance_event(prefix='compute')
  3536. @wrap_instance_fault
  3537. def set_admin_password(self, context, instance, new_pass):
  3538. """Set the root/admin password for an instance on this host.
  3539. This is generally only called by API password resets after an
  3540. image has been built.
  3541. :param context: Nova auth context.
  3542. :param instance: Nova instance object.
  3543. :param new_pass: The admin password for the instance.
  3544. """
  3545. context = context.elevated()
  3546. current_power_state = self._get_power_state(context, instance)
  3547. expected_state = power_state.RUNNING
  3548. if current_power_state != expected_state:
  3549. instance.task_state = None
  3550. instance.save(expected_task_state=task_states.UPDATING_PASSWORD)
  3551. _msg = _('instance %s is not running') % instance.uuid
  3552. raise exception.InstancePasswordSetFailed(
  3553. instance=instance.uuid, reason=_msg)
  3554. try:
  3555. self.driver.set_admin_password(instance, new_pass)
  3556. LOG.info("Admin password set", instance=instance)
  3557. instance.task_state = None
  3558. instance.save(
  3559. expected_task_state=task_states.UPDATING_PASSWORD)
  3560. except exception.InstanceAgentNotEnabled:
  3561. with excutils.save_and_reraise_exception():
  3562. LOG.debug('Guest agent is not enabled for the instance.',
  3563. instance=instance)
  3564. instance.task_state = None
  3565. instance.save(
  3566. expected_task_state=task_states.UPDATING_PASSWORD)
  3567. except exception.SetAdminPasswdNotSupported:
  3568. with excutils.save_and_reraise_exception():
  3569. LOG.info('set_admin_password is not supported '
  3570. 'by this driver or guest instance.',
  3571. instance=instance)
  3572. instance.task_state = None
  3573. instance.save(
  3574. expected_task_state=task_states.UPDATING_PASSWORD)
  3575. except NotImplementedError:
  3576. LOG.warning('set_admin_password is not implemented '
  3577. 'by this driver or guest instance.',
  3578. instance=instance)
  3579. instance.task_state = None
  3580. instance.save(
  3581. expected_task_state=task_states.UPDATING_PASSWORD)
  3582. raise NotImplementedError(_('set_admin_password is not '
  3583. 'implemented by this driver or guest '
  3584. 'instance.'))
  3585. except exception.UnexpectedTaskStateError:
  3586. # interrupted by another (most likely delete) task
  3587. # do not retry
  3588. raise
  3589. except Exception:
  3590. # Catch all here because this could be anything.
  3591. LOG.exception('set_admin_password failed', instance=instance)
  3592. # We create a new exception here so that we won't
  3593. # potentially reveal password information to the
  3594. # API caller. The real exception is logged above
  3595. _msg = _('error setting admin password')
  3596. raise exception.InstancePasswordSetFailed(
  3597. instance=instance.uuid, reason=_msg)
  3598. @wrap_exception()
  3599. @reverts_task_state
  3600. @wrap_instance_fault
  3601. def inject_file(self, context, path, file_contents, instance):
  3602. """Write a file to the specified path in an instance on this host."""
  3603. # NOTE(russellb) Remove this method, as well as the underlying virt
  3604. # driver methods, when the compute rpc interface is bumped to 4.x
  3605. # as it is no longer used.
  3606. context = context.elevated()
  3607. current_power_state = self._get_power_state(context, instance)
  3608. expected_state = power_state.RUNNING
  3609. if current_power_state != expected_state:
3610. LOG.warning('trying to inject a file into a non-running '
3611. 'instance (state: %(current_state)s expected: '
  3612. '%(expected_state)s)',
  3613. {'current_state': current_power_state,
  3614. 'expected_state': expected_state},
  3615. instance=instance)
  3616. LOG.info('injecting file to %s', path, instance=instance)
  3617. self.driver.inject_file(instance, path, file_contents)
  3618. def _get_rescue_image(self, context, instance, rescue_image_ref=None):
  3619. """Determine what image should be used to boot the rescue VM."""
  3620. # 1. If rescue_image_ref is passed in, use that for rescue.
  3621. # 2. Else, use the base image associated with instance's current image.
  3622. # The idea here is to provide the customer with a rescue
  3623. # environment which they are familiar with.
  3624. # So, if they built their instance off of a Debian image,
  3625. # their rescue VM will also be Debian.
  3626. # 3. As a last resort, use instance's current image.
  3627. if not rescue_image_ref:
  3628. system_meta = utils.instance_sys_meta(instance)
  3629. rescue_image_ref = system_meta.get('image_base_image_ref')
  3630. if not rescue_image_ref:
  3631. LOG.warning('Unable to find a different image to use for '
  3632. 'rescue VM, using instance\'s current image',
  3633. instance=instance)
  3634. rescue_image_ref = instance.image_ref
  3635. return objects.ImageMeta.from_image_ref(
  3636. context, self.image_api, rescue_image_ref)
  3637. @wrap_exception()
  3638. @reverts_task_state
  3639. @wrap_instance_event(prefix='compute')
  3640. @wrap_instance_fault
  3641. def rescue_instance(self, context, instance, rescue_password,
  3642. rescue_image_ref, clean_shutdown):
  3643. context = context.elevated()
  3644. LOG.info('Rescuing', instance=instance)
  3645. admin_password = (rescue_password if rescue_password else
  3646. utils.generate_password())
  3647. network_info = self.network_api.get_instance_nw_info(context, instance)
  3648. rescue_image_meta = self._get_rescue_image(context, instance,
  3649. rescue_image_ref)
  3650. extra_usage_info = {'rescue_image_name':
  3651. self._get_image_name(rescue_image_meta)}
  3652. self._notify_about_instance_usage(context, instance,
  3653. "rescue.start", extra_usage_info=extra_usage_info,
  3654. network_info=network_info)
  3655. compute_utils.notify_about_instance_rescue_action(
  3656. context, instance, self.host, rescue_image_ref,
  3657. phase=fields.NotificationPhase.START)
  3658. try:
  3659. self._power_off_instance(context, instance, clean_shutdown)
  3660. self.driver.rescue(context, instance,
  3661. network_info,
  3662. rescue_image_meta, admin_password)
  3663. except Exception as e:
  3664. LOG.exception("Error trying to Rescue Instance",
  3665. instance=instance)
  3666. self._set_instance_obj_error_state(context, instance)
  3667. raise exception.InstanceNotRescuable(
  3668. instance_id=instance.uuid,
  3669. reason=_("Driver Error: %s") % e)
  3670. compute_utils.notify_usage_exists(self.notifier, context, instance,
  3671. self.host, current_period=True)
  3672. instance.vm_state = vm_states.RESCUED
  3673. instance.task_state = None
  3674. instance.power_state = self._get_power_state(context, instance)
  3675. instance.launched_at = timeutils.utcnow()
  3676. instance.save(expected_task_state=task_states.RESCUING)
  3677. self._notify_about_instance_usage(context, instance,
  3678. "rescue.end", extra_usage_info=extra_usage_info,
  3679. network_info=network_info)
  3680. compute_utils.notify_about_instance_rescue_action(
  3681. context, instance, self.host, rescue_image_ref,
  3682. phase=fields.NotificationPhase.END)
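# Summary of the rescue path above (descriptive only): power off the
# guest, boot it via driver.rescue() from the selected rescue image with
# the provided or generated admin password, then mark the instance
# RESCUED with a refreshed power_state and launched_at.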
  3683. @wrap_exception()
  3684. @reverts_task_state
  3685. @wrap_instance_event(prefix='compute')
  3686. @wrap_instance_fault
  3687. def unrescue_instance(self, context, instance):
  3688. context = context.elevated()
  3689. LOG.info('Unrescuing', instance=instance)
  3690. network_info = self.network_api.get_instance_nw_info(context, instance)
  3691. self._notify_about_instance_usage(context, instance,
  3692. "unrescue.start", network_info=network_info)
  3693. compute_utils.notify_about_instance_action(context, instance,
  3694. self.host, action=fields.NotificationAction.UNRESCUE,
  3695. phase=fields.NotificationPhase.START)
  3696. with self._error_out_instance_on_exception(context, instance):
  3697. self.driver.unrescue(instance,
  3698. network_info)
  3699. instance.vm_state = vm_states.ACTIVE
  3700. instance.task_state = None
  3701. instance.power_state = self._get_power_state(context, instance)
  3702. instance.save(expected_task_state=task_states.UNRESCUING)
  3703. self._notify_about_instance_usage(context,
  3704. instance,
  3705. "unrescue.end",
  3706. network_info=network_info)
  3707. compute_utils.notify_about_instance_action(context, instance,
  3708. self.host, action=fields.NotificationAction.UNRESCUE,
  3709. phase=fields.NotificationPhase.END)
  3710. @wrap_exception()
  3711. @wrap_instance_fault
  3712. def change_instance_metadata(self, context, diff, instance):
  3713. """Update the metadata published to the instance."""
  3714. LOG.debug("Changing instance metadata according to %r",
  3715. diff, instance=instance)
  3716. self.driver.change_instance_metadata(context, instance, diff)
  3717. @wrap_exception()
  3718. @wrap_instance_event(prefix='compute')
  3719. @errors_out_migration
  3720. @wrap_instance_fault
  3721. def confirm_resize(self, context, instance, migration):
  3722. """Confirms a migration/resize and deletes the 'old' instance.
  3723. This is called from the API and runs on the source host.
  3724. Nothing needs to happen on the destination host at this point since
  3725. the instance is already running there. This routine just cleans up the
  3726. source host.
  3727. """
  3728. @utils.synchronized(instance.uuid)
  3729. def do_confirm_resize(context, instance, migration_id):
  3730. # NOTE(wangpan): Get the migration status from db, if it has been
  3731. # confirmed, we do nothing and return here
  3732. LOG.debug("Going to confirm migration %s", migration_id,
  3733. instance=instance)
  3734. try:
  3735. # TODO(russellb) Why are we sending the migration object just
  3736. # to turn around and look it up from the db again?
  3737. migration = objects.Migration.get_by_id(
  3738. context.elevated(), migration_id)
  3739. except exception.MigrationNotFound:
  3740. LOG.error("Migration %s is not found during confirmation",
  3741. migration_id, instance=instance)
  3742. return
  3743. if migration.status == 'confirmed':
  3744. LOG.info("Migration %s is already confirmed",
  3745. migration_id, instance=instance)
  3746. return
  3747. elif migration.status not in ('finished', 'confirming'):
  3748. LOG.warning("Unexpected confirmation status '%(status)s' "
  3749. "of migration %(id)s, exit confirmation process",
  3750. {"status": migration.status, "id": migration_id},
  3751. instance=instance)
  3752. return
  3753. # NOTE(wangpan): Get the instance from db, if it has been
  3754. # deleted, we do nothing and return here
  3755. expected_attrs = ['metadata', 'system_metadata', 'flavor']
  3756. try:
  3757. instance = objects.Instance.get_by_uuid(
  3758. context, instance.uuid,
  3759. expected_attrs=expected_attrs)
  3760. except exception.InstanceNotFound:
  3761. LOG.info("Instance is not found during confirmation",
  3762. instance=instance)
  3763. return
  3764. with self._error_out_instance_on_exception(context, instance):
  3765. try:
  3766. self._confirm_resize(
  3767. context, instance, migration=migration)
  3768. except Exception:
  3769. # Something failed when cleaning up the source host so
  3770. # log a traceback and leave a hint about hard rebooting
  3771. # the server to correct its state in the DB.
  3772. with excutils.save_and_reraise_exception(logger=LOG):
  3773. LOG.exception(
  3774. 'Confirm resize failed on source host %s. '
  3775. 'Resource allocations in the placement service '
  3776. 'will be removed regardless because the instance '
  3777. 'is now on the destination host %s. You can try '
  3778. 'hard rebooting the instance to correct its '
  3779. 'state.', self.host, migration.dest_compute,
  3780. instance=instance)
  3781. finally:
  3782. # Whether an error occurred or not, at this point the
  3783. # instance is on the dest host so to avoid leaking
  3784. # allocations in placement, delete them here.
  3785. self._delete_allocation_after_move(
  3786. context, instance, migration)
  3787. do_confirm_resize(context, instance, migration.id)
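# Summary of the guard checks in do_confirm_resize above (descriptive
# only):
#
#     migration 'confirmed'                    -> nothing to do, return
#     migration not 'finished'/'confirming'    -> unexpected status, return
#     instance no longer exists                -> return
#     otherwise                                -> _confirm_resize(), and in
#                                                 all cases drop the
#                                                 migration's allocations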
  3788. def _get_updated_nw_info_with_pci_mapping(self, nw_info, pci_mapping):
  3789. # NOTE(adrianc): This method returns a copy of nw_info if modifications
  3790. # are made else it returns the original nw_info.
  3791. updated_nw_info = nw_info
  3792. if nw_info and pci_mapping:
  3793. updated_nw_info = copy.deepcopy(nw_info)
  3794. for vif in updated_nw_info:
  3795. if vif['vnic_type'] in network_model.VNIC_TYPES_SRIOV:
  3796. try:
  3797. vif_pci_addr = vif['profile']['pci_slot']
  3798. new_addr = pci_mapping[vif_pci_addr].address
  3799. vif['profile']['pci_slot'] = new_addr
  3800. LOG.debug("Updating VIF's PCI address for VIF %(id)s. "
  3801. "Original value %(orig_val)s, "
  3802. "new value %(new_val)s",
  3803. {'id': vif['id'],
  3804. 'orig_val': vif_pci_addr,
  3805. 'new_val': new_addr})
  3806. except (KeyError, AttributeError):
  3807. with excutils.save_and_reraise_exception():
  3808. # NOTE(adrianc): This should never happen. If we
  3809. # get here it means there is some inconsistency
  3810. # with either 'nw_info' or 'pci_mapping'.
  3811. LOG.error("Unexpected error when updating network "
  3812. "information with PCI mapping.")
  3813. return updated_nw_info
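# A minimal illustrative sketch (not part of this module), using
# hypothetical PCI addresses, of the remapping done above for an SR-IOV
# VIF:
#
#     vif['profile']['pci_slot']            # '0000:05:00.2'
#     pci_mapping['0000:05:00.2'].address   # '0000:81:00.4'
#     # after the loop the copied VIF carries '0000:81:00.4', i.e. the
#     # device the calling code wants the virt driver to act on.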
  3814. def _confirm_resize(self, context, instance, migration=None):
  3815. """Destroys the source instance."""
  3816. self._notify_about_instance_usage(context, instance,
  3817. "resize.confirm.start")
  3818. compute_utils.notify_about_instance_action(context, instance,
  3819. self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
  3820. phase=fields.NotificationPhase.START)
  3821. # NOTE(danms): delete stashed migration information
  3822. old_instance_type = instance.old_flavor
  3823. instance.old_flavor = None
  3824. instance.new_flavor = None
  3825. instance.system_metadata.pop('old_vm_state', None)
  3826. instance.save()
  3827. # NOTE(tr3buchet): tear down networks on source host
  3828. self.network_api.setup_networks_on_host(context, instance,
  3829. migration.source_compute, teardown=True)
  3830. network_info = self.network_api.get_instance_nw_info(context,
  3831. instance)
  3832. # NOTE(adrianc): Populate old PCI device in VIF profile
  3833. # to allow virt driver to properly unplug it from Hypervisor.
  3834. pci_mapping = (instance.migration_context.
  3835. get_pci_mapping_for_migration(True))
  3836. network_info = self._get_updated_nw_info_with_pci_mapping(
  3837. network_info, pci_mapping)
  3838. self.driver.confirm_migration(context, migration, instance,
  3839. network_info)
  3840. migration.status = 'confirmed'
  3841. migration.save()
  3842. # NOTE(mriedem): drop_move_claim relies on
  3843. # instance.migration_context so make sure to not call
  3844. # instance.drop_migration_context() until after drop_move_claim
  3845. # is called.
  3846. self.rt.drop_move_claim(context, instance, migration.source_node,
  3847. old_instance_type, prefix='old_')
  3848. instance.drop_migration_context()
  3849. # NOTE(mriedem): The old_vm_state could be STOPPED but the user
  3850. # might have manually powered up the instance to confirm the
  3851. # resize/migrate, so we need to check the current power state
  3852. # on the instance and set the vm_state appropriately. We default
  3853. # to ACTIVE because if the power state is not SHUTDOWN, we
  3854. # assume _sync_instance_power_state will clean it up.
  3855. p_state = instance.power_state
  3856. vm_state = None
  3857. if p_state == power_state.SHUTDOWN:
  3858. vm_state = vm_states.STOPPED
  3859. LOG.debug("Resized/migrated instance is powered off. "
  3860. "Setting vm_state to '%s'.", vm_state,
  3861. instance=instance)
  3862. else:
  3863. vm_state = vm_states.ACTIVE
  3864. instance.vm_state = vm_state
  3865. instance.task_state = None
  3866. instance.save(expected_task_state=[None, task_states.DELETING,
  3867. task_states.SOFT_DELETING])
  3868. self._notify_about_instance_usage(
  3869. context, instance, "resize.confirm.end",
  3870. network_info=network_info)
  3871. compute_utils.notify_about_instance_action(context, instance,
  3872. self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
  3873. phase=fields.NotificationPhase.END)
  3874. def _delete_allocation_after_move(self, context, instance, migration):
  3875. """Deletes resource allocations held by the migration record against
  3876. the source compute node resource provider after a confirmed cold /
  3877. successful live migration.
  3878. """
  3879. try:
  3880. # NOTE(danms): We're finishing on the source node, so try
  3881. # to delete the allocation based on the migration uuid
  3882. self.reportclient.delete_allocation_for_instance(
  3883. context, migration.uuid, consumer_type='migration')
  3884. except exception.AllocationDeleteFailed:
  3885. LOG.error('Deleting allocation in placement for migration '
  3886. '%(migration_uuid)s failed. The instance '
3887. '%(instance_uuid)s will be put into ERROR state '
  3888. 'but the allocation held by the migration is '
  3889. 'leaked.',
  3890. {'instance_uuid': instance.uuid,
  3891. 'migration_uuid': migration.uuid})
  3892. raise
  3893. @wrap_exception()
  3894. @reverts_task_state
  3895. @wrap_instance_event(prefix='compute')
  3896. @errors_out_migration
  3897. @wrap_instance_fault
  3898. def revert_resize(self, context, instance, migration, request_spec=None):
  3899. """Destroys the new instance on the destination machine.
  3900. Reverts the model changes, and powers on the old instance on the
  3901. source machine.
  3902. """
  3903. # NOTE(comstud): A revert_resize is essentially a resize back to
  3904. # the old size, so we need to send a usage event here.
  3905. compute_utils.notify_usage_exists(self.notifier, context, instance,
  3906. self.host, current_period=True)
  3907. with self._error_out_instance_on_exception(context, instance):
  3908. # NOTE(tr3buchet): tear down networks on destination host
  3909. self.network_api.setup_networks_on_host(context, instance,
  3910. teardown=True)
  3911. self.network_api.migrate_instance_start(context,
  3912. instance,
  3913. migration)
  3914. network_info = self.network_api.get_instance_nw_info(context,
  3915. instance)
  3916. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  3917. context, instance.uuid)
  3918. block_device_info = self._get_instance_block_device_info(
  3919. context, instance, bdms=bdms)
  3920. destroy_disks = not self._is_instance_storage_shared(
  3921. context, instance, host=migration.source_compute)
  3922. self.driver.destroy(context, instance, network_info,
  3923. block_device_info, destroy_disks)
  3924. self._terminate_volume_connections(context, instance, bdms)
  3925. migration.status = 'reverted'
  3926. migration.save()
  3927. # NOTE(ndipanov): We need to do this here because dropping the
  3928. # claim means we lose the migration_context data. We really should
  3929. # fix this by moving the drop_move_claim call to the
  3930. # finish_revert_resize method as this is racy (revert is dropped,
  3931. # but instance resources will be tracked with the new flavor until
  3932. # it gets rolled back in finish_revert_resize, which is
  3933. # potentially wrong for a period of time).
  3934. instance.revert_migration_context()
  3935. instance.save()
  3936. self.rt.drop_move_claim(context, instance, instance.node)
  3937. # RPC cast back to the source host to finish the revert there.
  3938. self.compute_rpcapi.finish_revert_resize(context, instance,
  3939. migration, migration.source_compute, request_spec)
  3940. def _finish_revert_resize_network_migrate_finish(
  3941. self, context, instance, migration, provider_mappings):
  3942. """Causes port binding to be updated. In some Neutron or port
  3943. configurations - see NetworkModel.get_bind_time_events() - we
  3944. expect the vif-plugged event from Neutron immediately and wait for it.
  3945. The rest of the time, the event is expected further along in the
  3946. virt driver, so we don't wait here.
  3947. :param context: The request context.
  3948. :param instance: The instance undergoing the revert resize.
  3949. :param migration: The Migration object of the resize being reverted.
  3950. :param provider_mappings: a dict of list of resource provider uuids
  3951. keyed by port uuid
  3952. :raises: eventlet.timeout.Timeout or
  3953. exception.VirtualInterfacePlugException.
  3954. """
  3955. network_info = instance.get_network_info()
  3956. events = []
  3957. deadline = CONF.vif_plugging_timeout
  3958. if deadline and utils.is_neutron() and network_info:
  3959. events = network_info.get_bind_time_events(migration)
  3960. if events:
  3961. LOG.debug('Will wait for bind-time events: %s', events)
  3962. error_cb = self._neutron_failed_migration_callback
  3963. try:
  3964. with self.virtapi.wait_for_instance_event(instance, events,
  3965. deadline=deadline,
  3966. error_callback=error_cb):
  3967. # NOTE(hanrong): we need to change migration.dest_compute to
  3968. # source host temporarily.
  3969. # "network_api.migrate_instance_finish" will setup the network
  3970. # for the instance on the destination host. For revert resize,
  3971. # the instance will back to the source host, the setup of the
  3972. # network for instance should be on the source host. So set
  3973. # the migration.dest_compute to source host at here.
  3974. with utils.temporary_mutation(
  3975. migration, dest_compute=migration.source_compute):
  3976. self.network_api.migrate_instance_finish(
  3977. context, instance, migration, provider_mappings)
  3978. except eventlet.timeout.Timeout:
  3979. with excutils.save_and_reraise_exception():
  3980. LOG.error('Timeout waiting for Neutron events: %s', events,
  3981. instance=instance)
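# A minimal illustrative sketch (not part of this module): the bind-time
# events waited on above are (name, tag) pairs, typically something like
#
#     events = [('network-vif-plugged', '<port-uuid>')]
#
# for ports whose backend plugs the VIF when the port binding itself is
# updated, as opposed to backends where the virt driver plugs it later.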
  3982. @wrap_exception()
  3983. @reverts_task_state
  3984. @wrap_instance_event(prefix='compute')
  3985. @errors_out_migration
  3986. @wrap_instance_fault
  3987. def finish_revert_resize(
  3988. self, context, instance, migration, request_spec=None):
  3989. """Finishes the second half of reverting a resize on the source host.
  3990. Bring the original source instance state back (active/shutoff) and
  3991. revert the resized attributes in the database.
  3992. """
  3993. with self._error_out_instance_on_exception(context, instance):
  3994. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  3995. context, instance.uuid)
  3996. self._notify_about_instance_usage(
  3997. context, instance, "resize.revert.start")
  3998. compute_utils.notify_about_instance_action(context, instance,
  3999. self.host, action=fields.NotificationAction.RESIZE_REVERT,
  4000. phase=fields.NotificationPhase.START, bdms=bdms)
  4001. # NOTE(mriedem): delete stashed old_vm_state information; we
  4002. # default to ACTIVE for backwards compatibility if old_vm_state
  4003. # is not set
  4004. old_vm_state = instance.system_metadata.pop('old_vm_state',
  4005. vm_states.ACTIVE)
  4006. self._set_instance_info(instance, instance.old_flavor)
  4007. instance.old_flavor = None
  4008. instance.new_flavor = None
  4009. instance.host = migration.source_compute
  4010. instance.node = migration.source_node
  4011. instance.save()
  4012. try:
  4013. source_allocations = self._revert_allocation(
  4014. context, instance, migration)
  4015. except exception.AllocationMoveFailed:
  4016. LOG.error('Reverting allocation in placement for migration '
  4017. '%(migration_uuid)s failed. The instance '
  4018. '%(instance_uuid)s will be put into ERROR state but '
  4019. 'the allocation held by the migration is leaked.',
  4020. {'instance_uuid': instance.uuid,
  4021. 'migration_uuid': migration.uuid})
  4022. raise
  4023. provider_mappings = self._fill_provider_mapping_based_on_allocs(
  4024. context, source_allocations, request_spec)
  4025. self.network_api.setup_networks_on_host(context, instance,
  4026. migration.source_compute)
  4027. self._finish_revert_resize_network_migrate_finish(
  4028. context, instance, migration, provider_mappings)
  4029. network_info = self.network_api.get_instance_nw_info(context,
  4030. instance)
  4031. # revert_resize deleted any volume attachments for the instance
  4032. # and created new ones to be used on this host, but we
  4033. # have to update those attachments with the host connector so the
  4034. # BDM.connection_info will get set in the call to
  4035. # _get_instance_block_device_info below with refresh_conn_info=True
  4036. # and then the volumes can be re-connected via the driver on this
  4037. # host.
  4038. self._update_volume_attachments(context, instance, bdms)
  4039. block_device_info = self._get_instance_block_device_info(
  4040. context, instance, refresh_conn_info=True, bdms=bdms)
  4041. power_on = old_vm_state != vm_states.STOPPED
  4042. self.driver.finish_revert_migration(
  4043. context, instance, network_info, migration, block_device_info,
  4044. power_on)
  4045. instance.drop_migration_context()
  4046. instance.launched_at = timeutils.utcnow()
  4047. instance.save(expected_task_state=task_states.RESIZE_REVERTING)
  4048. # Complete any volume attachments so the volumes are in-use.
  4049. self._complete_volume_attachments(context, bdms)
  4050. # if the original vm state was STOPPED, set it back to STOPPED
  4051. LOG.info("Updating instance to original state: '%s'",
  4052. old_vm_state, instance=instance)
  4053. if power_on:
  4054. instance.vm_state = vm_states.ACTIVE
  4055. instance.task_state = None
  4056. instance.save()
  4057. else:
  4058. instance.task_state = task_states.POWERING_OFF
  4059. instance.save()
  4060. self.stop_instance(context, instance=instance,
  4061. clean_shutdown=True)
  4062. self._notify_about_instance_usage(
  4063. context, instance, "resize.revert.end")
  4064. compute_utils.notify_about_instance_action(context, instance,
  4065. self.host, action=fields.NotificationAction.RESIZE_REVERT,
  4066. phase=fields.NotificationPhase.END, bdms=bdms)
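# Summary of the source-side revert above (descriptive only): restore the
# old flavor/host/node, move the placement allocations from the migration
# consumer back to the instance, rebind ports and update volume
# attachments for this host, call driver.finish_revert_migration(), then
# either leave the instance ACTIVE or stop it again depending on the
# stashed old_vm_state.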
  4067. def _fill_provider_mapping_based_on_allocs(
  4068. self, context, allocations, request_spec):
  4069. """Fills and returns the request group - resource provider mapping
  4070. based on the allocation passed in.
  4071. :param context: The security context
4072. :param allocations: allocation dict keyed by RP UUID.
4073. :param request_spec: The RequestSpec object associated with the
4074. operation
4075. :returns: None if the request_spec is None. Otherwise a mapping
4076. between RequestGroup requester_id, currently the Neutron port_id,
4077. and a list of resource provider UUIDs providing resources for
  4078. that RequestGroup.
  4079. """
  4080. if request_spec:
  4081. # NOTE(gibi): We need to re-calculate the resource provider -
  4082. # port mapping as we have to have the neutron ports allocate
  4083. # from the source compute after revert.
  4084. scheduler_utils.fill_provider_mapping_based_on_allocation(
  4085. context, self.reportclient, request_spec, allocations)
  4086. provider_mappings = self._get_request_group_mapping(
  4087. request_spec)
  4088. else:
  4089. # NOTE(gibi): The compute RPC is pinned to be older than 5.2
  4090. # and therefore request_spec is not sent. We cannot calculate
  4091. # the provider mappings. If the instance has ports with
  4092. # resource request then the port update will fail in
  4093. # _update_port_binding_for_instance() called via
  4094. # _finish_revert_resize_network_migrate_finish() in
  4095. # finish_revert_resize.
  4096. provider_mappings = None
  4097. return provider_mappings
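# A minimal illustrative sketch (not part of this module) of the returned
# mapping, using hypothetical UUIDs:
#
#     provider_mappings = {
#         '<neutron-port-uuid>': ['<resource-provider-uuid>'],
#     }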
  4098. def _revert_allocation(self, context, instance, migration):
  4099. """Revert an allocation that is held by migration to our instance."""
  4100. # Fetch the original allocation that the instance had on the source
  4101. # node, which are now held by the migration
  4102. orig_alloc = self.reportclient.get_allocations_for_consumer(
  4103. context, migration.uuid)
  4104. if not orig_alloc:
  4105. LOG.error('Did not find resource allocations for migration '
  4106. '%s on source node %s. Unable to revert source node '
  4107. 'allocations back to the instance.',
  4108. migration.uuid, migration.source_node, instance=instance)
  4109. return False
  4110. LOG.info('Swapping old allocation on %(rp_uuids)s held by migration '
  4111. '%(mig)s for instance',
  4112. {'rp_uuids': orig_alloc.keys(), 'mig': migration.uuid},
  4113. instance=instance)
  4114. # FIXME(gibi): This method is flawed in that it does not handle
  4115. # allocations against sharing providers in any special way. This leads
  4116. # to duplicate allocations against the sharing provider during
  4117. # migration.
  4118. # TODO(cdent): Should we be doing anything with return values here?
  4119. self.reportclient.move_allocations(context, migration.uuid,
  4120. instance.uuid)
  4121. return orig_alloc
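# A minimal illustrative sketch (not part of this module) of what
# orig_alloc may look like, using hypothetical values:
#
#     {'<source-rp-uuid>': {'resources': {'VCPU': 2,
#                                         'MEMORY_MB': 2048,
#                                         'DISK_GB': 20}}}
#
# move_allocations() then re-homes those resources from the migration
# UUID consumer back onto the instance UUID consumer.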
  4122. def _prep_resize(self, context, image, instance, instance_type,
  4123. filter_properties, node, migration, request_spec,
  4124. clean_shutdown=True):
  4125. if not filter_properties:
  4126. filter_properties = {}
  4127. if not instance.host:
  4128. self._set_instance_obj_error_state(context, instance)
  4129. msg = _('Instance has no source host')
  4130. raise exception.MigrationError(reason=msg)
  4131. same_host = instance.host == self.host
  4132. # if the flavor IDs match, it's migrate; otherwise resize
  4133. if same_host and instance_type.id == instance['instance_type_id']:
  4134. # check driver whether support migrate to same host
  4135. if not self.driver.capabilities.get(
  4136. 'supports_migrate_to_same_host', False):
  4137. # Raise InstanceFaultRollback so that the
  4138. # _error_out_instance_on_exception context manager in
  4139. # prep_resize will set the instance.vm_state properly.
  4140. raise exception.InstanceFaultRollback(
  4141. inner_exception=exception.UnableToMigrateToSelf(
  4142. instance_id=instance.uuid, host=self.host))
  4143. # NOTE(danms): Stash the new instance_type to avoid having to
  4144. # look it up in the database later
  4145. instance.new_flavor = instance_type
  4146. # NOTE(mriedem): Stash the old vm_state so we can set the
  4147. # resized/reverted instance back to the same state later.
  4148. vm_state = instance.vm_state
  4149. LOG.debug('Stashing vm_state: %s', vm_state, instance=instance)
  4150. instance.system_metadata['old_vm_state'] = vm_state
  4151. instance.save()
  4152. if not isinstance(request_spec, objects.RequestSpec):
  4153. # Prior to compute RPC API 5.1 conductor would pass a legacy dict
  4154. # version of the request spec to compute and since Stein compute
  4155. # could be sending that back to conductor on reschedule, so if we
  4156. # got a dict convert it to an object.
  4157. # TODO(mriedem): We can drop this compat code when we only support
  4158. # compute RPC API >=6.0.
  4159. request_spec = objects.RequestSpec.from_primitives(
  4160. context, request_spec, filter_properties)
  4161. # We don't have to set the new flavor on the request spec because
  4162. # if we got here it was due to a reschedule from the compute and
  4163. # the request spec would already have the new flavor in it from the
  4164. # else block below.
  4165. request_group_resource_providers_mapping = \
  4166. self._get_request_group_mapping(request_spec)
  4167. if request_group_resource_providers_mapping:
  4168. self._update_pci_request_spec_with_allocated_interface_name(
  4169. context, instance, request_group_resource_providers_mapping)
  4170. limits = filter_properties.get('limits', {})
  4171. allocs = self.reportclient.get_allocations_for_consumer(
  4172. context, instance.uuid)
  4173. with self.rt.resize_claim(context, instance, instance_type, node,
  4174. migration, allocs, image_meta=image,
  4175. limits=limits) as claim:
  4176. LOG.info('Migrating', instance=instance)
  4177. # RPC cast to the source host to start the actual resize/migration.
  4178. self.compute_rpcapi.resize_instance(
  4179. context, instance, claim.migration, image,
  4180. instance_type, request_spec, clean_shutdown)
  4181. def _send_prep_resize_notifications(
  4182. self, context, instance, phase, flavor):
  4183. """Send "resize.prep.*" notifications.
  4184. :param context: nova auth request context
  4185. :param instance: The instance being resized
  4186. :param phase: The phase of the action (NotificationPhase enum)
  4187. :param flavor: The (new) flavor for the resize (same as existing
  4188. instance.flavor for a cold migration)
  4189. """
  4190. # Only send notify_usage_exists if it's the "start" phase.
  4191. if phase == fields.NotificationPhase.START:
  4192. compute_utils.notify_usage_exists(
  4193. self.notifier, context, instance, self.host,
  4194. current_period=True)
  4195. # Send extra usage info about the flavor if it's the "end" phase for
  4196. # the legacy unversioned notification.
  4197. extra_usage_info = None
  4198. if phase == fields.NotificationPhase.END:
  4199. extra_usage_info = dict(
  4200. new_instance_type=flavor.name,
  4201. new_instance_type_id=flavor.id)
  4202. self._notify_about_instance_usage(
  4203. context, instance, "resize.prep.%s" % phase,
  4204. extra_usage_info=extra_usage_info)
  4205. # Send the versioned notification.
  4206. compute_utils.notify_about_resize_prep_instance(
  4207. context, instance, self.host, phase, flavor)
  4208. @wrap_exception()
  4209. @reverts_task_state
  4210. @wrap_instance_event(prefix='compute')
  4211. @wrap_instance_fault
  4212. def prep_resize(self, context, image, instance, instance_type,
  4213. request_spec, filter_properties, node,
  4214. clean_shutdown, migration, host_list):
  4215. """Initiates the process of moving a running instance to another host.
  4216. Possibly changes the VCPU, RAM and disk size in the process.
  4217. This is initiated from conductor and runs on the destination host.
  4218. The main purpose of this method is performing some checks on the
  4219. destination host and making a claim for resources. If the claim fails
  4220. then a reschedule to another host may be attempted which involves
  4221. calling back to conductor to start the process over again.
  4222. """
  4223. if node is None:
  4224. node = self._get_nodename(instance, refresh=True)
  4225. # Pass instance_state=instance.vm_state because we can resize
  4226. # a STOPPED server and we don't want to set it back to ACTIVE
  4227. # in case _prep_resize fails.
  4228. instance_state = instance.vm_state
  4229. with self._error_out_instance_on_exception(
  4230. context, instance, instance_state=instance_state),\
  4231. errors_out_migration_ctxt(migration):
  4232. self._send_prep_resize_notifications(
  4233. context, instance, fields.NotificationPhase.START,
  4234. instance_type)
  4235. try:
  4236. self._prep_resize(context, image, instance,
  4237. instance_type, filter_properties,
  4238. node, migration, request_spec,
  4239. clean_shutdown)
  4240. except exception.BuildAbortException:
  4241. # NOTE(gibi): We failed
  4242. # _update_pci_request_spec_with_allocated_interface_name so
  4243. # there is no reason to re-schedule. Just revert the allocation
  4244. # and fail the migration.
  4245. with excutils.save_and_reraise_exception():
  4246. self._revert_allocation(context, instance, migration)
  4247. except Exception:
  4248. # Since we hit a failure, we're either rescheduling or dead
  4249. # and either way we need to cleanup any allocations created
  4250. # by the scheduler for the destination node.
  4251. self._revert_allocation(context, instance, migration)
  4252. # try to re-schedule the resize elsewhere:
  4253. exc_info = sys.exc_info()
  4254. self._reschedule_resize_or_reraise(context, instance,
  4255. exc_info, instance_type, request_spec,
  4256. filter_properties, host_list)
  4257. finally:
  4258. self._send_prep_resize_notifications(
  4259. context, instance, fields.NotificationPhase.END,
  4260. instance_type)
  4261. def _reschedule_resize_or_reraise(self, context, instance, exc_info,
  4262. instance_type, request_spec, filter_properties, host_list):
  4263. """Try to re-schedule the resize or re-raise the original error to
  4264. error out the instance.
  4265. """
  4266. if not filter_properties:
  4267. filter_properties = {}
  4268. rescheduled = False
  4269. instance_uuid = instance.uuid
  4270. try:
  4271. retry = filter_properties.get('retry')
  4272. if retry:
  4273. LOG.debug('Rescheduling, attempt %d', retry['num_attempts'],
  4274. instance_uuid=instance_uuid)
  4275. # reset the task state
  4276. task_state = task_states.RESIZE_PREP
  4277. self._instance_update(context, instance, task_state=task_state)
  4278. if exc_info:
  4279. # stringify to avoid circular ref problem in json
  4280. # serialization
  4281. retry['exc'] = traceback.format_exception_only(
  4282. exc_info[0], exc_info[1])
  4283. scheduler_hint = {'filter_properties': filter_properties}
  4284. self.compute_task_api.resize_instance(
  4285. context, instance, scheduler_hint, instance_type,
  4286. request_spec=request_spec, host_list=host_list)
  4287. rescheduled = True
  4288. else:
  4289. # no retry information, do not reschedule.
  4290. LOG.debug('Retry info not present, will not reschedule',
  4291. instance_uuid=instance_uuid)
  4292. rescheduled = False
  4293. except Exception as error:
  4294. rescheduled = False
  4295. LOG.exception("Error trying to reschedule",
  4296. instance_uuid=instance_uuid)
  4297. compute_utils.add_instance_fault_from_exc(context,
  4298. instance, error,
  4299. exc_info=sys.exc_info())
  4300. self._notify_about_instance_usage(context, instance,
  4301. 'resize.error', fault=error)
  4302. compute_utils.notify_about_instance_action(
  4303. context, instance, self.host,
  4304. action=fields.NotificationAction.RESIZE,
  4305. phase=fields.NotificationPhase.ERROR,
  4306. exception=error,
  4307. tb=','.join(traceback.format_exception(*exc_info)))
  4308. if rescheduled:
  4309. self._log_original_error(exc_info, instance_uuid)
  4310. compute_utils.add_instance_fault_from_exc(context,
  4311. instance, exc_info[1], exc_info=exc_info)
  4312. self._notify_about_instance_usage(context, instance,
  4313. 'resize.error', fault=exc_info[1])
  4314. compute_utils.notify_about_instance_action(
  4315. context, instance, self.host,
  4316. action=fields.NotificationAction.RESIZE,
  4317. phase=fields.NotificationPhase.ERROR,
  4318. exception=exc_info[1],
  4319. tb=','.join(traceback.format_exception(*exc_info)))
  4320. else:
  4321. # not re-scheduling
  4322. six.reraise(*exc_info)
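# A minimal illustrative sketch (not part of this module) of the retry
# info consulted above, using hypothetical values:
#
#     filter_properties = {
#         'retry': {'num_attempts': 1,
#                   'hosts': [['host1', 'node1']]},
#     }
#
# When 'retry' is missing the original exception is re-raised instead of
# rescheduling.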
  4323. @messaging.expected_exceptions(exception.MigrationPreCheckError)
  4324. @wrap_exception()
  4325. @wrap_instance_event(prefix='compute')
  4326. @wrap_instance_fault
  4327. def prep_snapshot_based_resize_at_dest(
  4328. self, ctxt, instance, flavor, nodename, migration, limits,
  4329. request_spec):
  4330. """Performs pre-cross-cell resize resource claim on the dest host.
  4331. This runs on the destination host in a cross-cell resize operation
  4332. before the resize is actually started.
  4333. Performs a resize_claim for resources that are not claimed in placement
  4334. like PCI devices and NUMA topology.
  4335. Note that this is different from same-cell prep_resize in that this:
  4336. * Does not RPC cast to the source compute, that is orchestrated from
  4337. conductor.
  4338. * This does not reschedule on failure, conductor handles that since
  4339. conductor is synchronously RPC calling this method. As such, the
  4340. reverts_task_state decorator is not used on this method.
  4341. :param ctxt: user auth request context
  4342. :param instance: the instance being resized
  4343. :param flavor: the flavor being resized to (unchanged for cold migrate)
  4344. :param nodename: Name of the target compute node
  4345. :param migration: nova.objects.Migration object for the operation
  4346. :param limits: nova.objects.SchedulerLimits object of resource limits
  4347. :param request_spec: nova.objects.RequestSpec object for the operation
  4348. :returns: nova.objects.MigrationContext; the migration context created
  4349. on the destination host during the resize_claim.
  4350. :raises: nova.exception.MigrationPreCheckError if the pre-check
  4351. validation fails for the given host selection
  4352. """
  4353. LOG.debug('Checking if we can cross-cell migrate instance to this '
  4354. 'host (%s).', self.host, instance=instance)
  4355. self._send_prep_resize_notifications(
  4356. ctxt, instance, fields.NotificationPhase.START, flavor)
  4357. # TODO(mriedem): _update_pci_request_spec_with_allocated_interface_name
  4358. # should be called here if the request spec has request group mappings,
  4359. # e.g. for things like QoS ports with resource requests. Do it outside
  4360. # the try/except so if it raises BuildAbortException we do not attempt
  4361. # to reschedule.
  4362. try:
  4363. # Get the allocations within the try/except block in case we get
  4364. # an error so MigrationPreCheckError is raised up.
  4365. allocations = self.reportclient.get_allocs_for_consumer(
  4366. ctxt, instance.uuid)['allocations']
  4367. # Claim resources on this target host using the new flavor which
  4368. # will create the MigrationContext object. Note that in the future
  4369. # if we want to do other validation here we should do it within
  4370. # the MoveClaim context so we can drop the claim if anything fails.
  4371. self.rt.resize_claim(
  4372. ctxt, instance, flavor, nodename, migration, allocations,
  4373. image_meta=instance.image_meta, limits=limits)
  4374. except Exception as ex:
  4375. err = six.text_type(ex)
  4376. LOG.warning(
  4377. 'Cross-cell resize pre-checks failed for this host (%s). '
  4378. 'Cleaning up. Failure: %s', self.host, err,
  4379. instance=instance, exc_info=True)
  4380. raise exception.MigrationPreCheckError(
  4381. reason=(_("Pre-checks failed on host '%(host)s'. "
  4382. "Error: %(error)s") %
  4383. {'host': self.host, 'error': err}))
  4384. finally:
  4385. self._send_prep_resize_notifications(
  4386. ctxt, instance, fields.NotificationPhase.END, flavor)
  4387. # ResourceTracker.resize_claim() sets instance.migration_context.
  4388. return instance.migration_context
  4389. @messaging.expected_exceptions(exception.InstancePowerOffFailure)
  4390. @wrap_exception()
  4391. @reverts_task_state
  4392. @wrap_instance_event(prefix='compute')
  4393. @errors_out_migration
  4394. @wrap_instance_fault
  4395. def prep_snapshot_based_resize_at_source(
  4396. self, ctxt, instance, migration, snapshot_id=None):
  4397. """Prepares the instance at the source host for cross-cell resize
4398. Performs actions like powering off the guest, uploading snapshot data if
  4399. the instance is not volume-backed, disconnecting volumes, unplugging
  4400. VIFs and activating the destination host port bindings.
  4401. :param ctxt: user auth request context targeted at source cell
  4402. :param instance: nova.objects.Instance; the instance being resized.
  4403. The expected instance.task_state is "resize_migrating" when calling
  4404. this method, and the expected task_state upon successful completion
  4405. is "resize_migrated".
  4406. :param migration: nova.objects.Migration object for the operation.
  4407. The expected migration.status is "pre-migrating" when calling this
  4408. method and the expected status upon successful completion is
  4409. "post-migrating".
  4410. :param snapshot_id: ID of the image snapshot to upload if not a
  4411. volume-backed instance
  4412. :raises: nova.exception.InstancePowerOffFailure if stopping the
  4413. instance fails
  4414. """
  4415. # Note that if anything fails here, the migration-based allocations
  4416. # created in conductor should be reverted by conductor as well,
  4417. # see MigrationTask.rollback.
  4418. self._prep_snapshot_based_resize_at_source(
  4419. ctxt, instance, migration, snapshot_id=snapshot_id)
  4420. @delete_image_on_error
  4421. def _snapshot_for_resize(self, ctxt, image_id, instance):
  4422. """Uploads snapshot for the instance during a snapshot-based resize
  4423. If the snapshot operation fails the image will be deleted.
  4424. :param ctxt: the nova auth request context for the resize operation
  4425. :param image_id: the snapshot image ID
  4426. :param instance: the instance to snapshot/resize
  4427. """
  4428. LOG.debug('Uploading snapshot data for image %s', image_id,
  4429. instance=instance)
  4430. # Note that we do not track the snapshot phase task states
  4431. # during resize since we do not want to reflect those into the
  4432. # actual instance.task_state.
  4433. update_task_state = lambda *args, **kwargs: None
  4434. with timeutils.StopWatch() as timer:
  4435. self.driver.snapshot(ctxt, instance, image_id, update_task_state)
  4436. LOG.debug('Took %0.2f seconds to snapshot the instance on '
  4437. 'the hypervisor.', timer.elapsed(), instance=instance)
  4438. def _prep_snapshot_based_resize_at_source(
  4439. self, ctxt, instance, migration, snapshot_id=None):
  4440. """Private method for prep_snapshot_based_resize_at_source so calling
  4441. code can handle errors and perform rollbacks as necessary.
  4442. """
  4443. # Fetch and update the instance.info_cache.
  4444. network_info = self.network_api.get_instance_nw_info(ctxt, instance)
  4445. # Get the BDMs attached to this instance on this source host.
  4446. bdms = instance.get_bdms()
  4447. # Send the resize.start notification.
  4448. self._send_resize_instance_notifications(
  4449. ctxt, instance, bdms, network_info, fields.NotificationPhase.START)
  4450. # Update the migration status from "pre-migrating" to "migrating".
  4451. migration.status = 'migrating'
  4452. migration.save()
  4453. # Since the instance is going to be left on the source host during the
  4454. # resize, we need to power it off so we do not have the instance
  4455. # potentially running in two places.
  4456. LOG.debug('Stopping instance', instance=instance)
  4457. try:
  4458. self._power_off_instance(ctxt, instance)
  4459. except Exception as e:
  4460. LOG.exception('Failed to power off instance.', instance=instance)
  4461. raise exception.InstancePowerOffFailure(reason=six.text_type(e))
  4462. instance.power_state = self._get_power_state(ctxt, instance)
  4463. # If a snapshot image ID was provided, we need to snapshot the guest
  4464. # disk image and upload it to the image service.
  4465. if snapshot_id:
  4466. self._snapshot_for_resize(ctxt, snapshot_id, instance)
  4467. block_device_info = self._get_instance_block_device_info(
  4468. ctxt, instance, bdms=bdms)
  4469. # If something fails at this point the instance must go to ERROR
  4470. # status for operator intervention or to reboot/rebuild the instance.
  4471. with self._error_out_instance_on_exception(
  4472. ctxt, instance, instance_state=vm_states.ERROR):
  4473. # Destroy the guest on the source host which will disconnect
  4474. # volumes and unplug VIFs. Note that we DO NOT destroy disks since
  4475. # we want to leave those on the source host in case of a later
  4476. # failure and disks are needed to recover the guest or in case the
  4477. # resize is reverted.
  4478. LOG.debug('Destroying guest on source host but retaining disks.',
  4479. instance=instance)
  4480. self.driver.destroy(
  4481. ctxt, instance, network_info,
  4482. block_device_info=block_device_info, destroy_disks=False)
  4483. # At this point the volumes are disconnected from this source host.
  4484. # Delete the old volume attachment records and create new empty
  4485. # ones which will be used later if the resize is reverted.
  4486. LOG.debug('Deleting volume attachments for the source host.',
  4487. instance=instance)
  4488. self._terminate_volume_connections(ctxt, instance, bdms)
  4489. # At this point the VIFs are unplugged from this source host.
  4490. # Activate the dest host port bindings created by conductor.
  4491. self.network_api.migrate_instance_start(ctxt, instance, migration)
  4492. # Update the migration status from "migrating" to "post-migrating".
  4493. migration.status = 'post-migrating'
  4494. migration.save()
  4495. # At this point, the traditional resize_instance would update the
  4496. # instance host/node values to point at the dest host/node because
  4497. # that is where the disk is transferred during resize_instance, but
  4498. # with cross-cell resize the instance is not yet at the dest host
  4499. # so we do not make that update here.
  4500. instance.task_state = task_states.RESIZE_MIGRATED
  4501. instance.save(expected_task_state=task_states.RESIZE_MIGRATING)
  4502. self._send_resize_instance_notifications(
  4503. ctxt, instance, bdms, network_info,
  4504. fields.NotificationPhase.END)
  4505. self.instance_events.clear_events_for_instance(instance)
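# Summary of the source-side sequence above (descriptive only): power off
# the guest, optionally upload a snapshot, destroy the guest while
# retaining its disks, delete and re-create empty volume attachments,
# activate the destination port bindings, then move the migration to
# 'post-migrating' and the instance to task_state RESIZE_MIGRATED.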
  4506. @wrap_exception()
  4507. @reverts_task_state
  4508. @wrap_instance_event(prefix='compute')
  4509. @wrap_instance_fault
  4510. def resize_instance(self, context, instance, image,
  4511. migration, instance_type, clean_shutdown,
  4512. request_spec=None):
  4513. """Starts the migration of a running instance to another host.
  4514. This is initiated from the destination host's ``prep_resize`` routine
  4515. and runs on the source host.
  4516. """
  4517. try:
  4518. self._resize_instance(context, instance, image, migration,
  4519. instance_type, clean_shutdown, request_spec)
  4520. except Exception:
  4521. with excutils.save_and_reraise_exception():
  4522. self._revert_allocation(context, instance, migration)
  4523. def _resize_instance(self, context, instance, image,
  4524. migration, instance_type, clean_shutdown,
  4525. request_spec):
  4526. with self._error_out_instance_on_exception(context, instance), \
  4527. errors_out_migration_ctxt(migration):
  4528. network_info = self.network_api.get_instance_nw_info(context,
  4529. instance)
  4530. migration.status = 'migrating'
  4531. migration.save()
  4532. instance.task_state = task_states.RESIZE_MIGRATING
  4533. instance.save(expected_task_state=task_states.RESIZE_PREP)
  4534. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  4535. context, instance.uuid)
  4536. self._send_resize_instance_notifications(
  4537. context, instance, bdms, network_info,
  4538. fields.NotificationPhase.START)
  4539. block_device_info = self._get_instance_block_device_info(
  4540. context, instance, bdms=bdms)
  4541. timeout, retry_interval = self._get_power_off_values(context,
  4542. instance, clean_shutdown)
  4543. disk_info = self.driver.migrate_disk_and_power_off(
  4544. context, instance, migration.dest_host,
  4545. instance_type, network_info,
  4546. block_device_info,
  4547. timeout, retry_interval)
  4548. self._terminate_volume_connections(context, instance, bdms)
  4549. self.network_api.migrate_instance_start(context,
  4550. instance,
  4551. migration)
  4552. migration.status = 'post-migrating'
  4553. migration.save()
  4554. instance.host = migration.dest_compute
  4555. instance.node = migration.dest_node
  4556. instance.task_state = task_states.RESIZE_MIGRATED
  4557. instance.save(expected_task_state=task_states.RESIZE_MIGRATING)
  4558. # RPC cast to the destination host to finish the resize/migration.
  4559. self.compute_rpcapi.finish_resize(context, instance,
  4560. migration, image, disk_info, migration.dest_compute,
  4561. request_spec)
  4562. self._send_resize_instance_notifications(
  4563. context, instance, bdms, network_info,
  4564. fields.NotificationPhase.END)
  4565. self.instance_events.clear_events_for_instance(instance)
  4566. def _send_resize_instance_notifications(
  4567. self, context, instance, bdms, network_info, phase):
  4568. """Send "resize.(start|end)" notifications.
  4569. :param context: nova auth request context
  4570. :param instance: The instance being resized
  4571. :param bdms: BlockDeviceMappingList for the BDMs associated with the
  4572. instance
  4573. :param network_info: NetworkInfo for the instance info cache of ports
  4574. :param phase: The phase of the action (NotificationPhase enum, either
  4575. ``start`` or ``end``)
  4576. """
  4577. action = fields.NotificationAction.RESIZE
  4578. # Send the legacy unversioned notification.
  4579. self._notify_about_instance_usage(
  4580. context, instance, "%s.%s" % (action, phase),
  4581. network_info=network_info)
  4582. # Send the versioned notification.
  4583. compute_utils.notify_about_instance_action(
  4584. context, instance, self.host, action=action, phase=phase,
  4585. bdms=bdms)
  4586. def _terminate_volume_connections(self, context, instance, bdms):
  4587. connector = None
  4588. for bdm in bdms:
  4589. if bdm.is_volume:
  4590. if bdm.attachment_id:
4591. # NOTE(jdg): The idea behind the new attach APIs was to have
4592. # a new code fork/path that we followed; we're not doing that,
4593. # so we have to do some extra work in here to make this
4594. # *behave* just like the old code. The new attach code in
4595. # Cinder doesn't allow disconnecting and reconnecting an
4596. # attachment (you just delete the attachment and get a new
4597. # one), so we do a delete and then a create without a
4598. # connector (effectively a reserve) to hold a fresh
4599. # attachment_id on the BDM.
  4600. attachment_id = self.volume_api.attachment_create(
  4601. context, bdm.volume_id, instance.uuid)['id']
  4602. self.volume_api.attachment_delete(context,
  4603. bdm.attachment_id)
  4604. bdm.attachment_id = attachment_id
  4605. bdm.save()
  4606. else:
  4607. if connector is None:
  4608. connector = self.driver.get_volume_connector(instance)
  4609. self.volume_api.terminate_connection(context,
  4610. bdm.volume_id,
  4611. connector)
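# Summary of the two paths above (descriptive only):
#
#     new-style (bdm.attachment_id set):
#         attachment_create(volume, instance)   # connector-less reserve
#         attachment_delete(old attachment)     # disconnects the volume
#     old-style:
#         terminate_connection(volume, host connector)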
  4612. @staticmethod
  4613. def _set_instance_info(instance, instance_type):
  4614. instance.instance_type_id = instance_type.id
  4615. instance.memory_mb = instance_type.memory_mb
  4616. instance.vcpus = instance_type.vcpus
  4617. instance.root_gb = instance_type.root_gb
  4618. instance.ephemeral_gb = instance_type.ephemeral_gb
  4619. instance.flavor = instance_type
  4620. def _update_volume_attachments(self, context, instance, bdms):
  4621. """Updates volume attachments using the virt driver host connector.
  4622. :param context: nova.context.RequestContext - user request context
  4623. :param instance: nova.objects.Instance
  4624. :param bdms: nova.objects.BlockDeviceMappingList - the list of block
  4625. device mappings for the given instance
  4626. """
  4627. if bdms:
  4628. connector = None
  4629. for bdm in bdms:
  4630. if bdm.is_volume and bdm.attachment_id:
  4631. if connector is None:
  4632. connector = self.driver.get_volume_connector(instance)
  4633. self.volume_api.attachment_update(
  4634. context, bdm.attachment_id, connector, bdm.device_name)
  4635. def _complete_volume_attachments(self, context, bdms):
  4636. """Completes volume attachments for the instance
  4637. :param context: nova.context.RequestContext - user request context
  4638. :param bdms: nova.objects.BlockDeviceMappingList - the list of block
  4639. device mappings for the given instance
  4640. """
  4641. if bdms:
  4642. for bdm in bdms:
  4643. if bdm.is_volume and bdm.attachment_id:
  4644. self.volume_api.attachment_complete(
  4645. context, bdm.attachment_id)
  4646. def _finish_resize(self, context, instance, migration, disk_info,
  4647. image_meta, bdms, request_spec):
4648. resize_instance = False # True if the disks need to be resized
  4649. old_instance_type_id = migration['old_instance_type_id']
  4650. new_instance_type_id = migration['new_instance_type_id']
  4651. old_flavor = instance.flavor # the current flavor is now old
  4652. # NOTE(mriedem): Get the old_vm_state so we know if we should
  4653. # power on the instance. If old_vm_state is not set we need to default
  4654. # to ACTIVE for backwards compatibility
  4655. old_vm_state = instance.system_metadata.get('old_vm_state',
  4656. vm_states.ACTIVE)
  4657. instance.old_flavor = old_flavor
  4658. if old_instance_type_id != new_instance_type_id:
  4659. new_flavor = instance.new_flavor # this is set in _prep_resize
  4660. # Set the flavor-related fields on the instance object including
  4661. # making instance.flavor = new_flavor.
  4662. self._set_instance_info(instance, new_flavor)
  4663. for key in ('root_gb', 'swap', 'ephemeral_gb'):
  4664. if old_flavor[key] != new_flavor[key]:
  4665. resize_instance = True
  4666. break
  4667. instance.apply_migration_context()
  4668. # NOTE(tr3buchet): setup networks on destination host
  4669. self.network_api.setup_networks_on_host(context, instance,
  4670. migration.dest_compute)
  4671. provider_mappings = self._get_request_group_mapping(request_spec)
  4672. # For neutron, migrate_instance_finish updates port bindings for this
  4673. # host including any PCI devices claimed for SR-IOV ports.
  4674. self.network_api.migrate_instance_finish(
  4675. context, instance, migration, provider_mappings)
  4676. network_info = self.network_api.get_instance_nw_info(context, instance)
  4677. instance.task_state = task_states.RESIZE_FINISH
  4678. instance.save(expected_task_state=task_states.RESIZE_MIGRATED)
  4679. self._send_finish_resize_notifications(
  4680. context, instance, bdms, network_info,
  4681. fields.NotificationPhase.START)
  4682. # We need to update any volume attachments using the destination
  4683. # host connector so that we can update the BDM.connection_info
  4684. # before calling driver.finish_migration otherwise the driver
  4685. # won't know how to connect the volumes to this host.
  4686. # Note that _get_instance_block_device_info with
  4687. # refresh_conn_info=True will update the BDM.connection_info value
  4688. # in the database so we must do this before calling that method.
  4689. self._update_volume_attachments(context, instance, bdms)
  4690. block_device_info = self._get_instance_block_device_info(
  4691. context, instance, refresh_conn_info=True, bdms=bdms)
  4692. # NOTE(mriedem): If the original vm_state was STOPPED, we don't
  4693. # automatically power on the instance after it's migrated
  4694. power_on = old_vm_state != vm_states.STOPPED
  4695. try:
  4696. self.driver.finish_migration(context, migration, instance,
  4697. disk_info,
  4698. network_info,
  4699. image_meta, resize_instance,
  4700. block_device_info, power_on)
  4701. except Exception:
  4702. # Note that we do not rollback port bindings to the source host
  4703. # because resize_instance (on the source host) updated the
  4704. # instance.host to point to *this* host (the destination host)
  4705. # so the port bindings pointing at this host are correct even
  4706. # though we failed to create the guest.
  4707. with excutils.save_and_reraise_exception():
  4708. # If we failed to create the guest on this host, reset the
  4709. # instance flavor-related fields to the old flavor. An
  4710. # error handler like reverts_task_state will save the changes.
  4711. if old_instance_type_id != new_instance_type_id:
  4712. self._set_instance_info(instance, old_flavor)
  4713. # Now complete any volume attachments that were previously updated.
  4714. self._complete_volume_attachments(context, bdms)
  4715. migration.status = 'finished'
  4716. migration.save()
  4717. instance.vm_state = vm_states.RESIZED
  4718. instance.task_state = None
  4719. instance.launched_at = timeutils.utcnow()
  4720. instance.save(expected_task_state=task_states.RESIZE_FINISH)
  4721. return network_info
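# Ordering recap for _finish_resize above (a summary derived from its
# inline comments, not new behavior): attachments are updated with the
# destination connector before BDM.connection_info is refreshed, the
# refreshed connection_info is what lets driver.finish_migration connect
# the volumes, and _complete_volume_attachments only runs once the guest
# has been created, so a failed finish_migration leaves the attachments
# uncompleted.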
  4722. @wrap_exception()
  4723. @reverts_task_state
  4724. @wrap_instance_event(prefix='compute')
  4725. @errors_out_migration
  4726. @wrap_instance_fault
  4727. def finish_resize(self, context, disk_info, image, instance,
  4728. migration, request_spec=None):
  4729. """Completes the migration process.
  4730. Sets up the newly transferred disk and turns on the instance at its
  4731. new host machine.
  4732. """
  4733. try:
  4734. self._finish_resize_helper(context, disk_info, image, instance,
  4735. migration, request_spec)
  4736. except Exception:
  4737. with excutils.save_and_reraise_exception():
  4738. # At this point, resize_instance (which runs on the source) has
  4739. # already updated the instance host/node values to point to
  4740. # this (the dest) compute, so we need to leave the allocations
  4741. # against the dest node resource provider intact and drop the
  4742. # allocations against the source node resource provider. If the
  4743. # user tries to recover the server by hard rebooting it, it
  4744. # will happen on this host so that's where the allocations
  4745. # should go. Note that this is the same method called from
  4746. # confirm_resize to cleanup the source node allocations held
  4747. # by the migration record.
  4748. LOG.info('Deleting allocations for old flavor on source node '
  4749. '%s after finish_resize failure. You may be able to '
  4750. 'recover the instance by hard rebooting it.',
  4751. migration.source_compute, instance=instance)
  4752. self._delete_allocation_after_move(
  4753. context, instance, migration)
  4754. def _finish_resize_helper(self, context, disk_info, image, instance,
  4755. migration, request_spec):
  4756. """Completes the migration process.
  4757. The caller must revert the instance's allocations if the migration
  4758. process failed.
  4759. """
  4760. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  4761. context, instance.uuid)
  4762. with self._error_out_instance_on_exception(context, instance):
  4763. image_meta = objects.ImageMeta.from_dict(image)
  4764. network_info = self._finish_resize(context, instance, migration,
  4765. disk_info, image_meta, bdms,
  4766. request_spec)
  4767. # TODO(melwitt): We should clean up instance console tokens here. The
  4768. # instance is on a new host and will need to establish a new console
  4769. # connection.
  4770. self._update_scheduler_instance_info(context, instance)
  4771. self._send_finish_resize_notifications(
  4772. context, instance, bdms, network_info,
  4773. fields.NotificationPhase.END)
  4774. def _send_finish_resize_notifications(
  4775. self, context, instance, bdms, network_info, phase):
  4776. """Send notifications for the finish_resize flow.
  4777. :param context: nova auth request context
  4778. :param instance: The instance being resized
  4779. :param bdms: BlockDeviceMappingList for the BDMs associated with the
  4780. instance
  4781. :param network_info: NetworkInfo for the instance info cache of ports
  4782. :param phase: The phase of the action (NotificationPhase enum, either
  4783. ``start`` or ``end``)
  4784. """
  4785. # Send the legacy unversioned notification.
  4786. self._notify_about_instance_usage(
  4787. context, instance, "finish_resize.%s" % phase,
  4788. network_info=network_info)
  4789. # Send the versioned notification.
  4790. compute_utils.notify_about_instance_action(
  4791. context, instance, self.host,
  4792. action=fields.NotificationAction.RESIZE_FINISH, phase=phase,
  4793. bdms=bdms)
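# For reference, and hedged since the exact strings come from the
# notification helpers rather than this method: the legacy call above
# typically appears on the bus as 'compute.instance.finish_resize.start'
# / '.end', while the versioned path emits 'instance.resize_finish.start'
# / '.end' payloads that include the BDM information.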
  4794. @wrap_exception()
  4795. @reverts_task_state
  4796. @wrap_instance_event(prefix='compute')
  4797. @errors_out_migration
  4798. @wrap_instance_fault
  4799. def finish_snapshot_based_resize_at_dest(
  4800. self, ctxt, instance, migration, snapshot_id, request_spec):
  4801. """Finishes the snapshot-based resize at the destination compute.
  4802. Sets up block devices and networking on the destination compute and
  4803. spawns the guest.
  4804. :param ctxt: nova auth request context targeted at the target cell DB
  4805. :param instance: The Instance object being resized with the
  4806. ``migration_context`` field set. Upon successful completion of this
  4807. method the vm_state should be "resized", the task_state should be
  4808. None, and migration context, host/node and flavor-related fields
  4809. should be set on the instance.
  4810. :param migration: The Migration object for this resize operation. Upon
  4811. successful completion of this method the migration status should
  4812. be "finished".
  4813. :param snapshot_id: ID of the image snapshot created for a
  4814. non-volume-backed instance, else None.
  4815. :param request_spec: nova.objects.RequestSpec object for the operation
  4816. """
  4817. LOG.info('Finishing snapshot based resize on destination host %s.',
  4818. self.host, instance=instance)
  4819. with self._error_out_instance_on_exception(ctxt, instance):
  4820. # Note that if anything fails here, the migration-based allocations
  4821. # created in conductor should be reverted by conductor as well,
  4822. # see MigrationTask.rollback.
  4823. self._finish_snapshot_based_resize_at_dest(
  4824. ctxt, instance, migration, snapshot_id)
  4825. def _finish_snapshot_based_resize_at_dest(
  4826. self, ctxt, instance, migration, snapshot_id):
  4827. """Private variant of finish_snapshot_based_resize_at_dest so the
  4828. caller can handle reverting resource allocations on failure and perform
  4829. other generic error handling.
  4830. """
  4831. # Figure out the image metadata to use when spawning the guest.
  4832. if snapshot_id:
  4833. image_meta = objects.ImageMeta.from_image_ref(
  4834. ctxt, self.image_api, snapshot_id)
  4835. else:
  4836. # Just use what is already on the volume-backed instance.
  4837. image_meta = instance.image_meta
  4838. resize = migration.migration_type == 'resize'
  4839. instance.old_flavor = instance.flavor
  4840. if resize:
  4841. flavor = instance.new_flavor
  4842. # If we are resizing to a new flavor we need to set the
  4843. # flavor-related fields on the instance.
  4844. # NOTE(mriedem): This is likely where storing old/new_flavor on
  4845. # the MigrationContext would make this cleaner.
  4846. self._set_instance_info(instance, flavor)
  4847. instance.apply_migration_context()
  4848. instance.task_state = task_states.RESIZE_FINISH
  4849. instance.save(expected_task_state=task_states.RESIZE_MIGRATED)
  4850. # This seems a bit late to be sending the start notification but
  4851. # it is what traditional resize has always done as well and it does
  4852. # contain the changes to the instance with the new_flavor and
  4853. # task_state.
  4854. bdms = instance.get_bdms()
  4855. network_info = instance.get_network_info()
  4856. self._send_finish_resize_notifications(
  4857. ctxt, instance, bdms, network_info,
  4858. fields.NotificationPhase.START)
  4859. # Setup volumes and networking and spawn the guest in the hypervisor.
  4860. self._finish_snapshot_based_resize_at_dest_spawn(
  4861. ctxt, instance, migration, image_meta, bdms)
  4862. # If we spawned from a temporary snapshot image we can delete that now,
  4863. # similar to how unshelve works.
  4864. if snapshot_id:
  4865. # FIXME(mriedem): Need to deal with bug 1653953 for libvirt with
  4866. # the rbd image backend. I think the cleanest thing we can do is
  4867. # from the driver check to see if instance.migration_context is not
  4868. # None and if so, get the Migration record for that context
  4869. # (instance.migration_context.migration_id) and from that check the
  4870. # Migration.cross_cell_move flag and if True, then flatten the
  4871. # image.
  4872. compute_utils.delete_image(
  4873. ctxt, instance, self.image_api, snapshot_id)
  4874. migration.status = 'finished'
  4875. migration.save()
  4876. self._update_instance_after_spawn(
  4877. ctxt, instance, vm_state=vm_states.RESIZED)
  4878. # Setting the host/node values will make the ResourceTracker continue
  4879. # to track usage for this instance on this host.
  4880. instance.host = migration.dest_compute
  4881. instance.node = migration.dest_node
  4882. instance.save(expected_task_state=task_states.RESIZE_FINISH)
  4883. # Broadcast to all schedulers that the instance is on this host.
  4884. self._update_scheduler_instance_info(ctxt, instance)
  4885. self._send_finish_resize_notifications(
  4886. ctxt, instance, bdms, network_info,
  4887. fields.NotificationPhase.END)
  4888. def _finish_snapshot_based_resize_at_dest_spawn(
  4889. self, ctxt, instance, migration, image_meta, bdms):
  4890. """Sets up volumes and networking and spawns the guest on the dest host
  4891. If the instance was stopped when the resize was initiated the guest
  4892. will be created but remain in a shutdown power state.
  4893. If the spawn fails, port bindings are rolled back to the source host
  4894. and volume connections are terminated for this dest host.
  4895. :param ctxt: nova auth request context
  4896. :param instance: Instance object being migrated
  4897. :param migration: Migration object for the operation
  4898. :param image_meta: ImageMeta object used during driver.spawn
  4899. :param bdms: BlockDeviceMappingList of BDMs for the instance
  4900. """
  4901. # Update the volume attachments using this host's connector.
  4902. # That will update the BlockDeviceMapping.connection_info which
  4903. # will be used to connect the volumes on this host during spawn().
  4904. block_device_info = self._prep_block_device(ctxt, instance, bdms)
  4905. allocations = self.reportclient.get_allocations_for_consumer(
  4906. ctxt, instance.uuid)
  4907. # We do not call self.network_api.setup_networks_on_host here because
  4908. # for neutron that sets up the port migration profile which is only
  4909. # used during live migration with DVR. Yes it is gross knowing what
  4910. # that method does internally. We could change this when bug 1814837
  4911. # is fixed if setup_networks_on_host is made smarter by passing the
  4912. # migration record and the method checks the migration_type.
  4913. # Activate the port bindings for this host.
  4914. # FIXME(mriedem): We're going to have the same issue as bug 1813789
  4915. # here because this will update the port bindings and send the
  4916. # network-vif-plugged event and that means when driver.spawn waits for
  4917. # it we might have already gotten the event and neutron won't send
  4918. # another one so we could timeout.
  4919. # TODO(mriedem): Calculate provider mappings when we support cross-cell
  4920. # resize/migrate with ports having resource requests.
  4921. self.network_api.migrate_instance_finish(
  4922. ctxt, instance, migration, provider_mappings=None)
  4923. network_info = self.network_api.get_instance_nw_info(ctxt, instance)
  4924. # If the original vm_state was STOPPED, we do not automatically
  4925. # power on the instance after it is migrated.
  4926. power_on = instance.system_metadata['old_vm_state'] == vm_states.ACTIVE
  4927. try:
  4928. # NOTE(mriedem): If this instance uses a config drive, it will get
  4929. # rebuilt here which means any personality files will be lost,
  4930. # similar to unshelve. If the instance is not using a config drive
  4931. # and getting metadata from the metadata API service, personality
  4932. # files would be lost regardless of the move operation.
  4933. self.driver.spawn(
  4934. ctxt, instance, image_meta, injected_files=[],
  4935. admin_password=None, allocations=allocations,
  4936. network_info=network_info, block_device_info=block_device_info,
  4937. power_on=power_on)
  4938. except Exception:
  4939. with excutils.save_and_reraise_exception(logger=LOG):
  4940. # Rollback port bindings to the source host.
  4941. try:
  4942. # This is gross but migrate_instance_start looks at the
  4943. # migration.dest_compute to determine where to activate the
  4944. # port bindings and we want the source compute port
  4945. # bindings to be re-activated. Remember at this point the
  4946. # instance.host is still pointing at the source compute.
  4947. # TODO(mriedem): Maybe we should be calling
  4948. # setup_instance_network_on_host here to deal with pci
  4949. # devices?
  4950. with utils.temporary_mutation(
  4951. migration, dest_compute=migration.source_compute):
  4952. self.network_api.migrate_instance_start(
  4953. ctxt, instance, migration)
  4954. except Exception:
  4955. LOG.exception(
  4956. 'Failed to activate port bindings on the source '
  4957. 'host: %s', migration.source_compute,
  4958. instance=instance)
  4959. # Rollback volume connections on this host.
  4960. for bdm in bdms:
  4961. if bdm.is_volume:
  4962. try:
  4963. self._remove_volume_connection(
  4964. ctxt, bdm, instance, delete_attachment=True)
  4965. except Exception:
  4966. LOG.exception('Failed to remove volume connection '
  4967. 'on this host %s for volume %s.',
  4968. self.host, bdm.volume_id,
  4969. instance=instance)
  4970. @wrap_exception()
  4971. @wrap_instance_fault
  4972. def add_fixed_ip_to_instance(self, context, network_id, instance):
  4973. """Calls network_api to add new fixed_ip to instance
  4974. then injects the new network info and resets instance networking.
  4975. """
  4976. self._notify_about_instance_usage(
  4977. context, instance, "create_ip.start")
  4978. network_info = self.network_api.add_fixed_ip_to_instance(context,
  4979. instance,
  4980. network_id)
  4981. self._inject_network_info(context, instance, network_info)
  4982. self.reset_network(context, instance)
  4983. # NOTE(russellb) We just want to bump updated_at. See bug 1143466.
  4984. instance.updated_at = timeutils.utcnow()
  4985. instance.save()
  4986. self._notify_about_instance_usage(
  4987. context, instance, "create_ip.end", network_info=network_info)
  4988. @wrap_exception()
  4989. @wrap_instance_fault
  4990. def remove_fixed_ip_from_instance(self, context, address, instance):
  4991. """Calls network_api to remove existing fixed_ip from instance
  4992. by injecting the altered network info and resetting
  4993. instance networking.
  4994. """
  4995. self._notify_about_instance_usage(
  4996. context, instance, "delete_ip.start")
  4997. network_info = self.network_api.remove_fixed_ip_from_instance(context,
  4998. instance,
  4999. address)
  5000. self._inject_network_info(context, instance, network_info)
  5001. self.reset_network(context, instance)
  5002. # NOTE(russellb) We just want to bump updated_at. See bug 1143466.
  5003. instance.updated_at = timeutils.utcnow()
  5004. instance.save()
  5005. self._notify_about_instance_usage(
  5006. context, instance, "delete_ip.end", network_info=network_info)
  5007. @wrap_exception()
  5008. @reverts_task_state
  5009. @wrap_instance_event(prefix='compute')
  5010. @wrap_instance_fault
  5011. def pause_instance(self, context, instance):
  5012. """Pause an instance on this host."""
  5013. context = context.elevated()
  5014. LOG.info('Pausing', instance=instance)
  5015. self._notify_about_instance_usage(context, instance, 'pause.start')
  5016. compute_utils.notify_about_instance_action(context, instance,
  5017. self.host, action=fields.NotificationAction.PAUSE,
  5018. phase=fields.NotificationPhase.START)
  5019. self.driver.pause(instance)
  5020. instance.power_state = self._get_power_state(context, instance)
  5021. instance.vm_state = vm_states.PAUSED
  5022. instance.task_state = None
  5023. instance.save(expected_task_state=task_states.PAUSING)
  5024. self._notify_about_instance_usage(context, instance, 'pause.end')
  5025. compute_utils.notify_about_instance_action(context, instance,
  5026. self.host, action=fields.NotificationAction.PAUSE,
  5027. phase=fields.NotificationPhase.END)
  5028. @wrap_exception()
  5029. @reverts_task_state
  5030. @wrap_instance_event(prefix='compute')
  5031. @wrap_instance_fault
  5032. def unpause_instance(self, context, instance):
  5033. """Unpause a paused instance on this host."""
  5034. context = context.elevated()
  5035. LOG.info('Unpausing', instance=instance)
  5036. self._notify_about_instance_usage(context, instance, 'unpause.start')
  5037. compute_utils.notify_about_instance_action(context, instance,
  5038. self.host, action=fields.NotificationAction.UNPAUSE,
  5039. phase=fields.NotificationPhase.START)
  5040. self.driver.unpause(instance)
  5041. instance.power_state = self._get_power_state(context, instance)
  5042. instance.vm_state = vm_states.ACTIVE
  5043. instance.task_state = None
  5044. instance.save(expected_task_state=task_states.UNPAUSING)
  5045. self._notify_about_instance_usage(context, instance, 'unpause.end')
  5046. compute_utils.notify_about_instance_action(context, instance,
  5047. self.host, action=fields.NotificationAction.UNPAUSE,
  5048. phase=fields.NotificationPhase.END)
  5049. @wrap_exception()
  5050. def host_power_action(self, context, action):
  5051. """Reboots, shuts down or powers up the host."""
  5052. return self.driver.host_power_action(action)
  5053. @wrap_exception()
  5054. def host_maintenance_mode(self, context, host, mode):
  5055. """Start/Stop host maintenance window. On start, it triggers
  5056. guest VMs evacuation.
  5057. """
  5058. return self.driver.host_maintenance_mode(host, mode)
  5059. def _update_compute_provider_status(self, context, enabled):
  5060. """Adds or removes the COMPUTE_STATUS_DISABLED trait for this host.
  5061. For each ComputeNode managed by this service, adds or removes the
5062. COMPUTE_STATUS_DISABLED trait to/from the associated resource provider
  5063. in Placement.
  5064. :param context: nova auth RequestContext
5065. :param enabled: True if the node is enabled, in which case the trait
5066. would be removed; False if the node is disabled, in which case
5067. the trait would be added.
  5068. :raises: ComputeHostNotFound if there are no compute nodes found in
  5069. the ResourceTracker for this service.
  5070. """
  5071. # Get the compute node(s) on this host. Remember that ironic can be
  5072. # managing more than one compute node.
  5073. nodes = self.rt.compute_nodes.values()
  5074. if not nodes:
  5075. raise exception.ComputeHostNotFound(host=self.host)
  5076. # For each node, we want to add (or remove) the COMPUTE_STATUS_DISABLED
  5077. # trait on the related resource provider in placement so the scheduler
  5078. # (pre-)filters the provider based on its status.
  5079. for node in nodes:
  5080. try:
  5081. self.virtapi.update_compute_provider_status(
  5082. context, node.uuid, enabled)
  5083. except (exception.ResourceProviderTraitRetrievalFailed,
  5084. exception.ResourceProviderUpdateConflict,
  5085. exception.ResourceProviderUpdateFailed,
  5086. exception.TraitRetrievalFailed) as e:
  5087. # This is best effort so just log a warning and continue.
  5088. LOG.warning('An error occurred while updating '
  5089. 'COMPUTE_STATUS_DISABLED trait on compute node '
  5090. 'resource provider %s. The trait will be '
  5091. 'synchronized when the update_available_resource '
  5092. 'periodic task runs. Error: %s',
  5093. node.uuid, e.format_message())
  5094. except Exception:
  5095. LOG.exception('An error occurred while updating '
  5096. 'COMPUTE_STATUS_DISABLED trait on compute node '
  5097. 'resource provider %s. The trait will be '
  5098. 'synchronized when the '
  5099. 'update_available_resource periodic task runs.',
  5100. node.uuid)
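# Scheduling context (a hedged note, not behavior of this method): once
# COMPUTE_STATUS_DISABLED is set on a provider, the scheduler's placement
# query asks for candidates with that trait forbidden (roughly
# required=!COMPUTE_STATUS_DISABLED), so a disabled host stops showing up
# as an allocation candidate until the trait is removed again.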
  5101. @wrap_exception()
  5102. def set_host_enabled(self, context, enabled):
  5103. """Sets the specified host's ability to accept new instances.
  5104. This method will add or remove the COMPUTE_STATUS_DISABLED trait
  5105. to/from the associated compute node resource provider(s) for this
  5106. compute service.
  5107. """
  5108. try:
  5109. self._update_compute_provider_status(context, enabled)
  5110. except exception.ComputeHostNotFound:
  5111. LOG.warning('Unable to add/remove trait COMPUTE_STATUS_DISABLED. '
  5112. 'No ComputeNode(s) found for host: %s', self.host)
  5113. try:
  5114. return self.driver.set_host_enabled(enabled)
  5115. except NotImplementedError:
  5116. # Only the xenapi driver implements set_host_enabled but we don't
  5117. # want NotImplementedError to get raised back to the API. We still
  5118. # need to honor the compute RPC API contract and return 'enabled'
  5119. # or 'disabled' though.
  5120. return 'enabled' if enabled else 'disabled'
  5121. @wrap_exception()
  5122. def get_host_uptime(self, context):
  5123. """Returns the result of calling "uptime" on the target host."""
  5124. return self.driver.get_host_uptime()
  5125. @wrap_exception()
  5126. @wrap_instance_fault
  5127. def get_diagnostics(self, context, instance):
  5128. """Retrieve diagnostics for an instance on this host."""
  5129. current_power_state = self._get_power_state(context, instance)
  5130. if current_power_state == power_state.RUNNING:
  5131. LOG.info("Retrieving diagnostics", instance=instance)
  5132. return self.driver.get_diagnostics(instance)
  5133. else:
  5134. raise exception.InstanceInvalidState(
  5135. attr='power state',
  5136. instance_uuid=instance.uuid,
  5137. state=power_state.STATE_MAP[instance.power_state],
  5138. method='get_diagnostics')
  5139. @wrap_exception()
  5140. @wrap_instance_fault
  5141. def get_instance_diagnostics(self, context, instance):
  5142. """Retrieve diagnostics for an instance on this host."""
  5143. current_power_state = self._get_power_state(context, instance)
  5144. if current_power_state == power_state.RUNNING:
  5145. LOG.info("Retrieving diagnostics", instance=instance)
  5146. return self.driver.get_instance_diagnostics(instance)
  5147. else:
  5148. raise exception.InstanceInvalidState(
  5149. attr='power state',
  5150. instance_uuid=instance.uuid,
  5151. state=power_state.STATE_MAP[instance.power_state],
5152. method='get_instance_diagnostics')
  5153. @wrap_exception()
  5154. @reverts_task_state
  5155. @wrap_instance_event(prefix='compute')
  5156. @wrap_instance_fault
  5157. def suspend_instance(self, context, instance):
  5158. """Suspend the given instance."""
  5159. context = context.elevated()
  5160. # Store the old state
  5161. instance.system_metadata['old_vm_state'] = instance.vm_state
  5162. self._notify_about_instance_usage(context, instance, 'suspend.start')
  5163. compute_utils.notify_about_instance_action(context, instance,
  5164. self.host, action=fields.NotificationAction.SUSPEND,
  5165. phase=fields.NotificationPhase.START)
  5166. with self._error_out_instance_on_exception(context, instance,
  5167. instance_state=instance.vm_state):
  5168. self.driver.suspend(context, instance)
  5169. instance.power_state = self._get_power_state(context, instance)
  5170. instance.vm_state = vm_states.SUSPENDED
  5171. instance.task_state = None
  5172. instance.save(expected_task_state=task_states.SUSPENDING)
  5173. self._notify_about_instance_usage(context, instance, 'suspend.end')
  5174. compute_utils.notify_about_instance_action(context, instance,
  5175. self.host, action=fields.NotificationAction.SUSPEND,
  5176. phase=fields.NotificationPhase.END)
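# Note: the 'old_vm_state' written to system_metadata above is the value
# that resume_instance() pops later to restore the pre-suspend vm_state,
# defaulting to ACTIVE if the key is missing.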
  5177. @wrap_exception()
  5178. @reverts_task_state
  5179. @wrap_instance_event(prefix='compute')
  5180. @wrap_instance_fault
  5181. def resume_instance(self, context, instance):
  5182. """Resume the given suspended instance."""
  5183. context = context.elevated()
  5184. LOG.info('Resuming', instance=instance)
  5185. self._notify_about_instance_usage(context, instance, 'resume.start')
  5186. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  5187. context, instance.uuid)
  5188. block_device_info = self._get_instance_block_device_info(
  5189. context, instance, bdms=bdms)
  5190. compute_utils.notify_about_instance_action(context, instance,
  5191. self.host, action=fields.NotificationAction.RESUME,
  5192. phase=fields.NotificationPhase.START, bdms=bdms)
  5193. network_info = self.network_api.get_instance_nw_info(context, instance)
  5194. with self._error_out_instance_on_exception(context, instance,
  5195. instance_state=instance.vm_state):
  5196. self.driver.resume(context, instance, network_info,
  5197. block_device_info)
  5198. instance.power_state = self._get_power_state(context, instance)
  5199. # We default to the ACTIVE state for backwards compatibility
  5200. instance.vm_state = instance.system_metadata.pop('old_vm_state',
  5201. vm_states.ACTIVE)
  5202. instance.task_state = None
  5203. instance.save(expected_task_state=task_states.RESUMING)
  5204. self._notify_about_instance_usage(context, instance, 'resume.end')
  5205. compute_utils.notify_about_instance_action(context, instance,
  5206. self.host, action=fields.NotificationAction.RESUME,
  5207. phase=fields.NotificationPhase.END, bdms=bdms)
  5208. @wrap_exception()
  5209. @reverts_task_state
  5210. @wrap_instance_event(prefix='compute')
  5211. @wrap_instance_fault
  5212. def shelve_instance(self, context, instance, image_id,
  5213. clean_shutdown):
  5214. """Shelve an instance.
  5215. This should be used when you want to take a snapshot of the instance.
  5216. It also adds system_metadata that can be used by a periodic task to
  5217. offload the shelved instance after a period of time.
  5218. :param context: request context
  5219. :param instance: an Instance object
  5220. :param image_id: an image id to snapshot to.
  5221. :param clean_shutdown: give the GuestOS a chance to stop
  5222. """
  5223. @utils.synchronized(instance.uuid)
  5224. def do_shelve_instance():
  5225. self._shelve_instance(context, instance, image_id, clean_shutdown)
  5226. do_shelve_instance()
  5227. def _shelve_instance(self, context, instance, image_id,
  5228. clean_shutdown):
  5229. LOG.info('Shelving', instance=instance)
  5230. offload = CONF.shelved_offload_time == 0
  5231. if offload:
  5232. # Get the BDMs early so we can pass them into versioned
  5233. # notifications since _shelve_offload_instance needs the
  5234. # BDMs anyway.
  5235. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  5236. context, instance.uuid)
  5237. else:
  5238. bdms = None
  5239. compute_utils.notify_usage_exists(self.notifier, context, instance,
  5240. self.host, current_period=True)
  5241. self._notify_about_instance_usage(context, instance, 'shelve.start')
  5242. compute_utils.notify_about_instance_action(context, instance,
  5243. self.host, action=fields.NotificationAction.SHELVE,
  5244. phase=fields.NotificationPhase.START, bdms=bdms)
  5245. def update_task_state(task_state, expected_state=task_states.SHELVING):
  5246. shelving_state_map = {
  5247. task_states.IMAGE_PENDING_UPLOAD:
  5248. task_states.SHELVING_IMAGE_PENDING_UPLOAD,
  5249. task_states.IMAGE_UPLOADING:
  5250. task_states.SHELVING_IMAGE_UPLOADING,
  5251. task_states.SHELVING: task_states.SHELVING}
  5252. task_state = shelving_state_map[task_state]
  5253. expected_state = shelving_state_map[expected_state]
  5254. instance.task_state = task_state
  5255. instance.save(expected_task_state=expected_state)
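# Worked example of the mapping above (hedged; the exact callbacks depend
# on the driver's snapshot implementation): if the snapshot code calls
# update_task_state(task_states.IMAGE_UPLOADING,
# expected_state=task_states.IMAGE_PENDING_UPLOAD), the instance is saved
# with task_state SHELVING_IMAGE_UPLOADING and expected_task_state
# SHELVING_IMAGE_PENDING_UPLOAD.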
  5256. # Do not attempt a clean shutdown of a paused guest since some
  5257. # hypervisors will fail the clean shutdown if the guest is not
  5258. # running.
  5259. if instance.power_state == power_state.PAUSED:
  5260. clean_shutdown = False
  5261. self._power_off_instance(context, instance, clean_shutdown)
  5262. self.driver.snapshot(context, instance, image_id, update_task_state)
  5263. instance.system_metadata['shelved_at'] = timeutils.utcnow().isoformat()
  5264. instance.system_metadata['shelved_image_id'] = image_id
  5265. instance.system_metadata['shelved_host'] = self.host
  5266. instance.vm_state = vm_states.SHELVED
  5267. instance.task_state = None
  5268. if CONF.shelved_offload_time == 0:
  5269. instance.task_state = task_states.SHELVING_OFFLOADING
  5270. instance.power_state = self._get_power_state(context, instance)
  5271. instance.save(expected_task_state=[
  5272. task_states.SHELVING,
  5273. task_states.SHELVING_IMAGE_UPLOADING])
  5274. self._notify_about_instance_usage(context, instance, 'shelve.end')
  5275. compute_utils.notify_about_instance_action(context, instance,
  5276. self.host, action=fields.NotificationAction.SHELVE,
  5277. phase=fields.NotificationPhase.END, bdms=bdms)
  5278. if offload:
  5279. self._shelve_offload_instance(context, instance,
  5280. clean_shutdown=False, bdms=bdms)
  5281. @wrap_exception()
  5282. @reverts_task_state
  5283. @wrap_instance_event(prefix='compute')
  5284. @wrap_instance_fault
  5285. def shelve_offload_instance(self, context, instance, clean_shutdown):
  5286. """Remove a shelved instance from the hypervisor.
  5287. This frees up those resources for use by other instances, but may lead
  5288. to slower unshelve times for this instance. This method is used by
5289. volume-backed instances since restoring them doesn't involve the
  5290. potentially large download of an image.
  5291. :param context: request context
  5292. :param instance: nova.objects.instance.Instance
  5293. :param clean_shutdown: give the GuestOS a chance to stop
  5294. """
  5295. @utils.synchronized(instance.uuid)
  5296. def do_shelve_offload_instance():
  5297. self._shelve_offload_instance(context, instance, clean_shutdown)
  5298. do_shelve_offload_instance()
  5299. def _shelve_offload_instance(self, context, instance, clean_shutdown,
  5300. bdms=None):
  5301. LOG.info('Shelve offloading', instance=instance)
  5302. if bdms is None:
  5303. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  5304. context, instance.uuid)
  5305. self._notify_about_instance_usage(context, instance,
  5306. 'shelve_offload.start')
  5307. compute_utils.notify_about_instance_action(context, instance,
  5308. self.host, action=fields.NotificationAction.SHELVE_OFFLOAD,
  5309. phase=fields.NotificationPhase.START, bdms=bdms)
  5310. self._power_off_instance(context, instance, clean_shutdown)
  5311. current_power_state = self._get_power_state(context, instance)
  5312. self.network_api.cleanup_instance_network_on_host(context, instance,
  5313. instance.host)
  5314. network_info = self.network_api.get_instance_nw_info(context, instance)
  5315. block_device_info = self._get_instance_block_device_info(context,
  5316. instance,
  5317. bdms=bdms)
  5318. self.driver.destroy(context, instance, network_info,
  5319. block_device_info)
5320. # The instance is going to be removed from the host, so we want to
5321. # terminate all the connections with the volume server and the host.
  5322. self._terminate_volume_connections(context, instance, bdms)
  5323. # Free up the resource allocations in the placement service.
  5324. # This should happen *before* the vm_state is changed to
  5325. # SHELVED_OFFLOADED in case client-side code is polling the API to
  5326. # schedule more instances (or unshelve) once this server is offloaded.
  5327. self.rt.delete_allocation_for_shelve_offloaded_instance(context,
  5328. instance)
  5329. instance.power_state = current_power_state
  5330. # NOTE(mriedem): The vm_state has to be set before updating the
  5331. # resource tracker, see vm_states.ALLOW_RESOURCE_REMOVAL. The host/node
  5332. # values cannot be nulled out until after updating the resource tracker
  5333. # though.
  5334. instance.vm_state = vm_states.SHELVED_OFFLOADED
  5335. instance.task_state = None
  5336. instance.save(expected_task_state=[task_states.SHELVING,
  5337. task_states.SHELVING_OFFLOADING])
  5338. # NOTE(ndipanov): Free resources from the resource tracker
  5339. self._update_resource_tracker(context, instance)
  5340. # NOTE(sfinucan): RPC calls should no longer be attempted against this
  5341. # instance, so ensure any calls result in errors
  5342. self._nil_out_instance_obj_host_and_node(instance)
  5343. instance.save(expected_task_state=None)
  5344. # TODO(melwitt): We should clean up instance console tokens here. The
  5345. # instance has no host at this point and will need to establish a new
  5346. # console connection in the future after it is unshelved.
  5347. self._delete_scheduler_instance_info(context, instance.uuid)
  5348. self._notify_about_instance_usage(context, instance,
  5349. 'shelve_offload.end')
  5350. compute_utils.notify_about_instance_action(context, instance,
  5351. self.host, action=fields.NotificationAction.SHELVE_OFFLOAD,
  5352. phase=fields.NotificationPhase.END, bdms=bdms)
  5353. @wrap_exception()
  5354. @reverts_task_state
  5355. @wrap_instance_event(prefix='compute')
  5356. @wrap_instance_fault
  5357. def unshelve_instance(self, context, instance, image,
  5358. filter_properties, node, request_spec=None):
  5359. """Unshelve the instance.
  5360. :param context: request context
  5361. :param instance: a nova.objects.instance.Instance object
5362. :param image: an image to build from. If None, we assume a
5363. volume-backed instance.
  5364. :param filter_properties: dict containing limits, retry info etc.
  5365. :param node: target compute node
  5366. :param request_spec: the RequestSpec object used to schedule the
  5367. instance
  5368. """
  5369. if filter_properties is None:
  5370. filter_properties = {}
  5371. @utils.synchronized(instance.uuid)
  5372. def do_unshelve_instance():
  5373. self._unshelve_instance(context, instance, image,
  5374. filter_properties, node)
  5375. do_unshelve_instance()
  5376. def _unshelve_instance_key_scrub(self, instance):
  5377. """Remove data from the instance that may cause side effects."""
  5378. cleaned_keys = dict(
  5379. key_data=instance.key_data,
  5380. auto_disk_config=instance.auto_disk_config)
  5381. instance.key_data = None
  5382. instance.auto_disk_config = False
  5383. return cleaned_keys
  5384. def _unshelve_instance_key_restore(self, instance, keys):
  5385. """Restore previously scrubbed keys before saving the instance."""
  5386. instance.update(keys)
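# These two helpers are used as a pair in _unshelve_instance() below: the
# keys are scrubbed before driver.spawn() so the stored key_data and
# auto_disk_config values cannot cause side effects during the rebuild,
# and the saved values are restored onto the instance once the spawn has
# succeeded.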
  5387. def _unshelve_instance(self, context, instance, image, filter_properties,
  5388. node):
  5389. LOG.info('Unshelving', instance=instance)
  5390. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  5391. context, instance.uuid)
  5392. self._notify_about_instance_usage(context, instance, 'unshelve.start')
  5393. compute_utils.notify_about_instance_action(context, instance,
  5394. self.host, action=fields.NotificationAction.UNSHELVE,
  5395. phase=fields.NotificationPhase.START, bdms=bdms)
  5396. instance.task_state = task_states.SPAWNING
  5397. instance.save()
  5398. block_device_info = self._prep_block_device(context, instance, bdms)
  5399. scrubbed_keys = self._unshelve_instance_key_scrub(instance)
  5400. if node is None:
  5401. node = self._get_nodename(instance)
  5402. limits = filter_properties.get('limits', {})
  5403. allocations = self.reportclient.get_allocations_for_consumer(
  5404. context, instance.uuid)
  5405. shelved_image_ref = instance.image_ref
  5406. if image:
  5407. instance.image_ref = image['id']
  5408. image_meta = objects.ImageMeta.from_dict(image)
  5409. else:
  5410. image_meta = objects.ImageMeta.from_dict(
  5411. utils.get_image_from_system_metadata(
  5412. instance.system_metadata))
  5413. self.network_api.setup_instance_network_on_host(context, instance,
  5414. self.host)
  5415. network_info = self.network_api.get_instance_nw_info(context, instance)
  5416. try:
  5417. with self.rt.instance_claim(context, instance, node, allocations,
  5418. limits):
  5419. self.driver.spawn(context, instance, image_meta,
  5420. injected_files=[],
  5421. admin_password=None,
  5422. allocations=allocations,
  5423. network_info=network_info,
  5424. block_device_info=block_device_info)
  5425. except Exception:
  5426. with excutils.save_and_reraise_exception(logger=LOG):
  5427. LOG.exception('Instance failed to spawn',
  5428. instance=instance)
  5429. # Cleanup allocations created by the scheduler on this host
  5430. # since we failed to spawn the instance. We do this both if
  5431. # the instance claim failed with ComputeResourcesUnavailable
  5432. # or if we did claim but the spawn failed, because aborting the
  5433. # instance claim will not remove the allocations.
  5434. self.reportclient.delete_allocation_for_instance(context,
  5435. instance.uuid)
  5436. # FIXME: Umm, shouldn't we be rolling back port bindings too?
  5437. self._terminate_volume_connections(context, instance, bdms)
  5438. # The reverts_task_state decorator on unshelve_instance will
  5439. # eventually save these updates.
  5440. self._nil_out_instance_obj_host_and_node(instance)
  5441. if image:
  5442. instance.image_ref = shelved_image_ref
  5443. self._delete_snapshot_of_shelved_instance(context, instance,
  5444. image['id'])
  5445. self._unshelve_instance_key_restore(instance, scrubbed_keys)
  5446. self._update_instance_after_spawn(context, instance)
  5447. # Delete system_metadata for a shelved instance
  5448. compute_utils.remove_shelved_keys_from_system_metadata(instance)
  5449. instance.save(expected_task_state=task_states.SPAWNING)
  5450. self._update_scheduler_instance_info(context, instance)
  5451. self._notify_about_instance_usage(context, instance, 'unshelve.end')
  5452. compute_utils.notify_about_instance_action(context, instance,
  5453. self.host, action=fields.NotificationAction.UNSHELVE,
  5454. phase=fields.NotificationPhase.END, bdms=bdms)
  5455. @messaging.expected_exceptions(NotImplementedError)
  5456. @wrap_instance_fault
  5457. def reset_network(self, context, instance):
  5458. """Reset networking on the given instance."""
  5459. LOG.debug('Reset network', instance=instance)
  5460. self.driver.reset_network(instance)
  5461. def _inject_network_info(self, context, instance, network_info):
  5462. """Inject network info for the given instance."""
  5463. LOG.debug('Inject network info', instance=instance)
  5464. LOG.debug('network_info to inject: |%s|', network_info,
  5465. instance=instance)
  5466. self.driver.inject_network_info(instance,
  5467. network_info)
  5468. @wrap_instance_fault
  5469. def inject_network_info(self, context, instance):
  5470. """Inject network info, but don't return the info."""
  5471. network_info = self.network_api.get_instance_nw_info(context, instance)
  5472. self._inject_network_info(context, instance, network_info)
  5473. @messaging.expected_exceptions(NotImplementedError,
  5474. exception.ConsoleNotAvailable,
  5475. exception.InstanceNotFound)
  5476. @wrap_exception()
  5477. @wrap_instance_fault
  5478. def get_console_output(self, context, instance, tail_length):
  5479. """Send the console output for the given instance."""
  5480. context = context.elevated()
  5481. LOG.info("Get console output", instance=instance)
  5482. output = self.driver.get_console_output(context, instance)
  5483. if type(output) is six.text_type:
  5484. output = six.b(output)
  5485. if tail_length is not None:
  5486. output = self._tail_log(output, tail_length)
  5487. return output.decode('ascii', 'replace')
  5488. def _tail_log(self, log, length):
  5489. try:
  5490. length = int(length)
  5491. except ValueError:
  5492. length = 0
  5493. if length == 0:
  5494. return b''
  5495. else:
  5496. return b'\n'.join(log.split(b'\n')[-int(length):])
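# Small worked example, purely illustrative: _tail_log(b'a\nb\nc', 2)
# splits on b'\n' and rejoins the last two entries, returning b'b\nc';
# a zero or non-integer tail_length returns b''.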
  5497. @messaging.expected_exceptions(exception.ConsoleTypeInvalid,
  5498. exception.InstanceNotReady,
  5499. exception.InstanceNotFound,
  5500. exception.ConsoleTypeUnavailable,
  5501. NotImplementedError)
  5502. @wrap_exception()
  5503. @wrap_instance_fault
  5504. def get_vnc_console(self, context, console_type, instance):
  5505. """Return connection information for a vnc console."""
  5506. context = context.elevated()
  5507. LOG.debug("Getting vnc console", instance=instance)
  5508. if not CONF.vnc.enabled:
  5509. raise exception.ConsoleTypeUnavailable(console_type=console_type)
  5510. if console_type == 'novnc':
  5511. # For essex, novncproxy_base_url must include the full path
  5512. # including the html file (like http://myhost/vnc_auto.html)
  5513. access_url_base = CONF.vnc.novncproxy_base_url
  5514. elif console_type == 'xvpvnc':
  5515. access_url_base = CONF.vnc.xvpvncproxy_base_url
  5516. else:
  5517. raise exception.ConsoleTypeInvalid(console_type=console_type)
  5518. try:
  5519. # Retrieve connect info from