OpenStack Compute (Nova)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

6235 lines
277KB

  1. # Copyright 2010 United States Government as represented by the
  2. # Administrator of the National Aeronautics and Space Administration.
  3. # Copyright 2011 Justin Santa Barbara
  4. # All Rights Reserved.
  5. #
  6. # Licensed under the Apache License, Version 2.0 (the "License"); you may
  7. # not use this file except in compliance with the License. You may obtain
  8. # a copy of the License at
  9. #
  10. # http://www.apache.org/licenses/LICENSE-2.0
  11. #
  12. # Unless required by applicable law or agreed to in writing, software
  13. # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  14. # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  15. # License for the specific language governing permissions and limitations
  16. # under the License.
  17. """Handles all processes relating to instances (guest vms).
  18. The :py:class:`ComputeManager` class is a :py:class:`nova.manager.Manager` that
  19. handles RPC calls relating to creating instances. It is responsible for
  20. building a disk image, launching it via the underlying virtualization driver,
  21. responding to calls to check its state, attaching persistent storage, and
  22. terminating it.
  23. """
  24. import base64
  25. import contextlib
  26. import functools
  27. import socket
  28. import sys
  29. import time
  30. import traceback
  31. import uuid
  32. from cinderclient import exceptions as cinder_exception
  33. import eventlet.event
  34. from eventlet import greenthread
  35. import eventlet.timeout
  36. from oslo.config import cfg
  37. from oslo import messaging
  38. from oslo.serialization import jsonutils
  39. from oslo.utils import excutils
  40. from oslo.utils import strutils
  41. from oslo.utils import timeutils
  42. import six
  43. from nova import block_device
  44. from nova.cells import rpcapi as cells_rpcapi
  45. from nova.cloudpipe import pipelib
  46. from nova import compute
  47. from nova.compute import flavors
  48. from nova.compute import power_state
  49. from nova.compute import resource_tracker
  50. from nova.compute import rpcapi as compute_rpcapi
  51. from nova.compute import task_states
  52. from nova.compute import utils as compute_utils
  53. from nova.compute import vm_states
  54. from nova import conductor
  55. from nova import consoleauth
  56. import nova.context
  57. from nova import exception
  58. from nova import hooks
  59. from nova.i18n import _
  60. from nova.i18n import _LE
  61. from nova.i18n import _LI
  62. from nova.i18n import _LW
  63. from nova import image
  64. from nova.image import glance
  65. from nova import manager
  66. from nova import network
  67. from nova.network import model as network_model
  68. from nova.network.security_group import openstack_driver
  69. from nova import objects
  70. from nova.objects import base as obj_base
  71. from nova.objects import instance as instance_obj
  72. from nova.objects import quotas as quotas_obj
  73. from nova.openstack.common import log as logging
  74. from nova.openstack.common import periodic_task
  75. from nova import paths
  76. from nova import rpc
  77. from nova import safe_utils
  78. from nova.scheduler import rpcapi as scheduler_rpcapi
  79. from nova import utils
  80. from nova.virt import block_device as driver_block_device
  81. from nova.virt import driver
  82. from nova.virt import event as virtevent
  83. from nova.virt import storage_users
  84. from nova.virt import virtapi
  85. from nova import volume
  86. from nova.volume import encryptors
  87. compute_opts = [
  88. cfg.StrOpt('console_host',
  89. default=socket.gethostname(),
  90. help='Console proxy host to use to connect '
  91. 'to instances on this host.'),
  92. cfg.StrOpt('default_access_ip_network_name',
  93. help='Name of network to use to set access IPs for instances'),
  94. cfg.BoolOpt('defer_iptables_apply',
  95. default=False,
  96. help='Whether to batch up the application of IPTables rules'
  97. ' during a host restart and apply all at the end of the'
  98. ' init phase'),
  99. cfg.StrOpt('instances_path',
  100. default=paths.state_path_def('instances'),
  101. help='Where instances are stored on disk'),
  102. cfg.BoolOpt('instance_usage_audit',
  103. default=False,
  104. help="Generate periodic compute.instance.exists"
  105. " notifications"),
  106. cfg.IntOpt('live_migration_retry_count',
  107. default=30,
  108. help="Number of 1 second retries needed in live_migration"),
  109. cfg.BoolOpt('resume_guests_state_on_host_boot',
  110. default=False,
  111. help='Whether to start guests that were running before the '
  112. 'host rebooted'),
  113. cfg.IntOpt('network_allocate_retries',
  114. default=0,
  115. help="Number of times to retry network allocation on failures"),
  116. cfg.IntOpt('block_device_allocate_retries',
  117. default=60,
  118. help='Number of times to retry block device'
  119. ' allocation on failures')
  120. ]
  121. interval_opts = [
  122. cfg.IntOpt('bandwidth_poll_interval',
  123. default=600,
  124. help='Interval to pull network bandwidth usage info. Not '
  125. 'supported on all hypervisors. Set to -1 to disable. '
  126. 'Setting this to 0 will run at the default rate.'),
  127. cfg.IntOpt('sync_power_state_interval',
  128. default=600,
  129. help='Interval to sync power states between the database and '
  130. 'the hypervisor. Set to -1 to disable. '
  131. 'Setting this to 0 will run at the default rate.'),
  132. cfg.IntOpt("heal_instance_info_cache_interval",
  133. default=60,
  134. help="Number of seconds between instance info_cache self "
  135. "healing updates"),
  136. cfg.IntOpt('reclaim_instance_interval',
  137. default=0,
  138. help='Interval in seconds for reclaiming deleted instances'),
  139. cfg.IntOpt('volume_usage_poll_interval',
  140. default=0,
  141. help='Interval in seconds for gathering volume usages'),
  142. cfg.IntOpt('shelved_poll_interval',
  143. default=3600,
  144. help='Interval in seconds for polling shelved instances to '
  145. 'offload. Set to -1 to disable.'
  146. 'Setting this to 0 will run at the default rate.'),
  147. cfg.IntOpt('shelved_offload_time',
  148. default=0,
  149. help='Time in seconds before a shelved instance is eligible '
  150. 'for removing from a host. -1 never offload, 0 offload '
  151. 'when shelved'),
  152. cfg.IntOpt('instance_delete_interval',
  153. default=300,
  154. help=('Interval in seconds for retrying failed instance file '
  155. 'deletes. Set to -1 to disable. '
  156. 'Setting this to 0 will run at the default rate.')),
  157. cfg.IntOpt('block_device_allocate_retries_interval',
  158. default=3,
  159. help='Waiting time interval (seconds) between block'
  160. ' device allocation retries on failures')
  161. ]
  162. timeout_opts = [
  163. cfg.IntOpt("reboot_timeout",
  164. default=0,
  165. help="Automatically hard reboot an instance if it has been "
  166. "stuck in a rebooting state longer than N seconds. "
  167. "Set to 0 to disable."),
  168. cfg.IntOpt("instance_build_timeout",
  169. default=0,
  170. help="Amount of time in seconds an instance can be in BUILD "
  171. "before going into ERROR status."
  172. "Set to 0 to disable."),
  173. cfg.IntOpt("rescue_timeout",
  174. default=0,
  175. help="Automatically unrescue an instance after N seconds. "
  176. "Set to 0 to disable."),
  177. cfg.IntOpt("resize_confirm_window",
  178. default=0,
  179. help="Automatically confirm resizes after N seconds. "
  180. "Set to 0 to disable."),
  181. cfg.IntOpt("shutdown_timeout",
  182. default=60,
  183. help="Total amount of time to wait in seconds for an instance "
  184. "to perform a clean shutdown."),
  185. ]
  186. running_deleted_opts = [
  187. cfg.StrOpt("running_deleted_instance_action",
  188. default="reap",
  189. help="Action to take if a running deleted instance is detected."
  190. "Valid options are 'noop', 'log', 'shutdown', or 'reap'. "
  191. "Set to 'noop' to take no action."),
  192. cfg.IntOpt("running_deleted_instance_poll_interval",
  193. default=1800,
  194. help="Number of seconds to wait between runs of the cleanup "
  195. "task."),
  196. cfg.IntOpt("running_deleted_instance_timeout",
  197. default=0,
  198. help="Number of seconds after being deleted when a running "
  199. "instance should be considered eligible for cleanup."),
  200. ]
  201. instance_cleaning_opts = [
  202. cfg.IntOpt('maximum_instance_delete_attempts',
  203. default=5,
  204. help=('The number of times to attempt to reap an instance\'s '
  205. 'files.')),
  206. ]
  207. CONF = cfg.CONF
  208. CONF.register_opts(compute_opts)
  209. CONF.register_opts(interval_opts)
  210. CONF.register_opts(timeout_opts)
  211. CONF.register_opts(running_deleted_opts)
  212. CONF.register_opts(instance_cleaning_opts)
  213. CONF.import_opt('allow_resize_to_same_host', 'nova.compute.api')
  214. CONF.import_opt('console_topic', 'nova.console.rpcapi')
  215. CONF.import_opt('host', 'nova.netconf')
  216. CONF.import_opt('my_ip', 'nova.netconf')
  217. CONF.import_opt('vnc_enabled', 'nova.vnc')
  218. CONF.import_opt('enabled', 'nova.spice', group='spice')
  219. CONF.import_opt('enable', 'nova.cells.opts', group='cells')
  220. CONF.import_opt('image_cache_subdirectory_name', 'nova.virt.imagecache')
  221. CONF.import_opt('image_cache_manager_interval', 'nova.virt.imagecache')
  222. CONF.import_opt('enabled', 'nova.rdp', group='rdp')
  223. CONF.import_opt('html5_proxy_base_url', 'nova.rdp', group='rdp')
  224. CONF.import_opt('enabled', 'nova.console.serial', group='serial_console')
  225. CONF.import_opt('base_url', 'nova.console.serial', group='serial_console')
  226. LOG = logging.getLogger(__name__)
  227. get_notifier = functools.partial(rpc.get_notifier, service='compute')
  228. wrap_exception = functools.partial(exception.wrap_exception,
  229. get_notifier=get_notifier)
  230. @utils.expects_func_args('migration')
  231. def errors_out_migration(function):
  232. """Decorator to error out migration on failure."""
  233. @functools.wraps(function)
  234. def decorated_function(self, context, *args, **kwargs):
  235. try:
  236. return function(self, context, *args, **kwargs)
  237. except Exception:
  238. with excutils.save_and_reraise_exception():
  239. migration = kwargs['migration']
  240. status = migration.status
  241. if status not in ['migrating', 'post-migrating']:
  242. return
  243. migration.status = 'error'
  244. try:
  245. migration.save(context.elevated())
  246. except Exception:
  247. LOG.debug('Error setting migration status '
  248. 'for instance %s.',
  249. migration.instance_uuid, exc_info=True)
  250. return decorated_function
  251. @utils.expects_func_args('instance')
  252. def reverts_task_state(function):
  253. """Decorator to revert task_state on failure."""
  254. @functools.wraps(function)
  255. def decorated_function(self, context, *args, **kwargs):
  256. try:
  257. return function(self, context, *args, **kwargs)
  258. except exception.UnexpectedTaskStateError as e:
  259. # Note(maoy): unexpected task state means the current
  260. # task is preempted. Do not clear task state in this
  261. # case.
  262. with excutils.save_and_reraise_exception():
  263. LOG.info(_("Task possibly preempted: %s") % e.format_message())
  264. except Exception:
  265. with excutils.save_and_reraise_exception():
  266. try:
  267. self._instance_update(context,
  268. kwargs['instance']['uuid'],
  269. task_state=None)
  270. except Exception:
  271. pass
  272. return decorated_function
  273. @utils.expects_func_args('instance')
  274. def wrap_instance_fault(function):
  275. """Wraps a method to catch exceptions related to instances.
  276. This decorator wraps a method to catch any exceptions having to do with
  277. an instance that may get thrown. It then logs an instance fault in the db.
  278. """
  279. @functools.wraps(function)
  280. def decorated_function(self, context, *args, **kwargs):
  281. try:
  282. return function(self, context, *args, **kwargs)
  283. except exception.InstanceNotFound:
  284. raise
  285. except Exception as e:
  286. # NOTE(gtt): If argument 'instance' is in args rather than kwargs,
  287. # we will get a KeyError exception which will cover up the real
  288. # exception. So, we update kwargs with the values from args first.
  289. # then, we can get 'instance' from kwargs easily.
  290. kwargs.update(dict(zip(function.func_code.co_varnames[2:], args)))
  291. with excutils.save_and_reraise_exception():
  292. compute_utils.add_instance_fault_from_exc(context,
  293. kwargs['instance'], e, sys.exc_info())
  294. return decorated_function
  295. @utils.expects_func_args('instance')
  296. def wrap_instance_event(function):
  297. """Wraps a method to log the event taken on the instance, and result.
  298. This decorator wraps a method to log the start and result of an event, as
  299. part of an action taken on an instance.
  300. """
  301. @functools.wraps(function)
  302. def decorated_function(self, context, *args, **kwargs):
  303. wrapped_func = utils.get_wrapped_function(function)
  304. keyed_args = safe_utils.getcallargs(wrapped_func, context, *args,
  305. **kwargs)
  306. instance_uuid = keyed_args['instance']['uuid']
  307. event_name = 'compute_{0}'.format(function.func_name)
  308. with compute_utils.EventReporter(context, event_name, instance_uuid):
  309. return function(self, context, *args, **kwargs)
  310. return decorated_function
  311. @utils.expects_func_args('image_id', 'instance')
  312. def delete_image_on_error(function):
  313. """Used for snapshot related method to ensure the image created in
  314. compute.api is deleted when an error occurs.
  315. """
  316. @functools.wraps(function)
  317. def decorated_function(self, context, image_id, instance,
  318. *args, **kwargs):
  319. try:
  320. return function(self, context, image_id, instance,
  321. *args, **kwargs)
  322. except Exception:
  323. with excutils.save_and_reraise_exception():
  324. LOG.debug("Cleaning up image %s", image_id,
  325. exc_info=True, instance=instance)
  326. try:
  327. self.image_api.delete(context, image_id)
  328. except Exception:
  329. LOG.exception(_LE("Error while trying to clean up "
  330. "image %s"), image_id,
  331. instance=instance)
  332. return decorated_function
  333. # TODO(danms): Remove me after Icehouse
  334. # NOTE(mikal): if the method being decorated has more than one decorator, then
  335. # put this one first. Otherwise the various exception handling decorators do
  336. # not function correctly.
  337. def object_compat(function):
  338. """Wraps a method that expects a new-world instance
  339. This provides compatibility for callers passing old-style dict
  340. instances.
  341. """
  342. @functools.wraps(function)
  343. def decorated_function(self, context, *args, **kwargs):
  344. def _load_instance(instance_or_dict):
  345. if isinstance(instance_or_dict, dict):
  346. instance = objects.Instance._from_db_object(
  347. context, objects.Instance(), instance_or_dict,
  348. expected_attrs=metas)
  349. instance._context = context
  350. return instance
  351. return instance_or_dict
  352. metas = ['metadata', 'system_metadata']
  353. try:
  354. kwargs['instance'] = _load_instance(kwargs['instance'])
  355. except KeyError:
  356. args = (_load_instance(args[0]),) + args[1:]
  357. migration = kwargs.get('migration')
  358. if isinstance(migration, dict):
  359. migration = objects.Migration._from_db_object(
  360. context.elevated(), objects.Migration(),
  361. migration)
  362. kwargs['migration'] = migration
  363. return function(self, context, *args, **kwargs)
  364. return decorated_function
  365. # TODO(danms): Remove me after Icehouse
  366. def aggregate_object_compat(function):
  367. """Wraps a method that expects a new-world aggregate."""
  368. @functools.wraps(function)
  369. def decorated_function(self, context, *args, **kwargs):
  370. aggregate = kwargs.get('aggregate')
  371. if isinstance(aggregate, dict):
  372. aggregate = objects.Aggregate._from_db_object(
  373. context.elevated(), objects.Aggregate(),
  374. aggregate)
  375. kwargs['aggregate'] = aggregate
  376. return function(self, context, *args, **kwargs)
  377. return decorated_function
  378. class InstanceEvents(object):
  379. def __init__(self):
  380. self._events = {}
  381. @staticmethod
  382. def _lock_name(instance):
  383. return '%s-%s' % (instance.uuid, 'events')
  384. def prepare_for_instance_event(self, instance, event_name):
  385. """Prepare to receive an event for an instance.
  386. This will register an event for the given instance that we will
  387. wait on later. This should be called before initiating whatever
  388. action will trigger the event. The resulting eventlet.event.Event
  389. object should be wait()'d on to ensure completion.
  390. :param instance: the instance for which the event will be generated
  391. :param event_name: the name of the event we're expecting
  392. :returns: an event object that should be wait()'d on
  393. """
  394. @utils.synchronized(self._lock_name(instance))
  395. def _create_or_get_event():
  396. if instance.uuid not in self._events:
  397. self._events.setdefault(instance.uuid, {})
  398. return self._events[instance.uuid].setdefault(
  399. event_name, eventlet.event.Event())
  400. LOG.debug('Preparing to wait for external event %(event)s',
  401. {'event': event_name}, instance=instance)
  402. return _create_or_get_event()
  403. def pop_instance_event(self, instance, event):
  404. """Remove a pending event from the wait list.
  405. This will remove a pending event from the wait list so that it
  406. can be used to signal the waiters to wake up.
  407. :param instance: the instance for which the event was generated
  408. :param event: the nova.objects.external_event.InstanceExternalEvent
  409. that describes the event
  410. :returns: the eventlet.event.Event object on which the waiters
  411. are blocked
  412. """
  413. no_events_sentinel = object()
  414. no_matching_event_sentinel = object()
  415. @utils.synchronized(self._lock_name(instance))
  416. def _pop_event():
  417. events = self._events.get(instance.uuid)
  418. if not events:
  419. return no_events_sentinel
  420. _event = events.pop(event.key, None)
  421. if not events:
  422. del self._events[instance.uuid]
  423. if _event is None:
  424. return no_matching_event_sentinel
  425. return _event
  426. result = _pop_event()
  427. if result == no_events_sentinel:
  428. LOG.debug('No waiting events found dispatching %(event)s',
  429. {'event': event.key},
  430. instance=instance)
  431. return None
  432. elif result == no_matching_event_sentinel:
  433. LOG.debug('No event matching %(event)s in %(events)s',
  434. {'event': event.key,
  435. 'events': self._events.get(instance.uuid, {}).keys()},
  436. instance=instance)
  437. return None
  438. else:
  439. return result
  440. def clear_events_for_instance(self, instance):
  441. """Remove all pending events for an instance.
  442. This will remove all events currently pending for an instance
  443. and return them (indexed by event name).
  444. :param instance: the instance for which events should be purged
  445. :returns: a dictionary of {event_name: eventlet.event.Event}
  446. """
  447. @utils.synchronized(self._lock_name(instance))
  448. def _clear_events():
  449. # NOTE(danms): Use getitem syntax for the instance until
  450. # all the callers are using objects
  451. return self._events.pop(instance['uuid'], {})
  452. return _clear_events()
  453. class ComputeVirtAPI(virtapi.VirtAPI):
  454. def __init__(self, compute):
  455. super(ComputeVirtAPI, self).__init__()
  456. self._compute = compute
  457. def provider_fw_rule_get_all(self, context):
  458. return self._compute.conductor_api.provider_fw_rule_get_all(context)
  459. def _default_error_callback(self, event_name, instance):
  460. raise exception.NovaException(_('Instance event failed'))
  461. @contextlib.contextmanager
  462. def wait_for_instance_event(self, instance, event_names, deadline=300,
  463. error_callback=None):
  464. """Plan to wait for some events, run some code, then wait.
  465. This context manager will first create plans to wait for the
  466. provided event_names, yield, and then wait for all the scheduled
  467. events to complete.
  468. Note that this uses an eventlet.timeout.Timeout to bound the
  469. operation, so callers should be prepared to catch that
  470. failure and handle that situation appropriately.
  471. If the event is not received by the specified timeout deadline,
  472. eventlet.timeout.Timeout is raised.
  473. If the event is received but did not have a 'completed'
  474. status, a NovaException is raised. If an error_callback is
  475. provided, instead of raising an exception as detailed above
  476. for the failure case, the callback will be called with the
  477. event_name and instance, and can return True to continue
  478. waiting for the rest of the events, False to stop processing,
  479. or raise an exception which will bubble up to the waiter.
  480. :param instance: The instance for which an event is expected
  481. :param event_names: A list of event names. Each element can be a
  482. string event name or tuple of strings to
  483. indicate (name, tag).
  484. :param deadline: Maximum number of seconds we should wait for all
  485. of the specified events to arrive.
  486. :param error_callback: A function to be called if an event arrives
  487. """
  488. if error_callback is None:
  489. error_callback = self._default_error_callback
  490. events = {}
  491. for event_name in event_names:
  492. if isinstance(event_name, tuple):
  493. name, tag = event_name
  494. event_name = objects.InstanceExternalEvent.make_key(
  495. name, tag)
  496. events[event_name] = (
  497. self._compute.instance_events.prepare_for_instance_event(
  498. instance, event_name))
  499. yield
  500. with eventlet.timeout.Timeout(deadline):
  501. for event_name, event in events.items():
  502. actual_event = event.wait()
  503. if actual_event.status == 'completed':
  504. continue
  505. decision = error_callback(event_name, instance)
  506. if decision is False:
  507. break
  508. class ComputeManager(manager.Manager):
  509. """Manages the running instances from creation to destruction."""
  510. target = messaging.Target(version='3.35')
  511. # How long to wait in seconds before re-issuing a shutdown
  512. # signal to a instance during power off. The overall
  513. # time to wait is set by CONF.shutdown_timeout.
  514. SHUTDOWN_RETRY_INTERVAL = 10
  515. def __init__(self, compute_driver=None, *args, **kwargs):
  516. """Load configuration options and connect to the hypervisor."""
  517. self.virtapi = ComputeVirtAPI(self)
  518. self.network_api = network.API()
  519. self.volume_api = volume.API()
  520. self.image_api = image.API()
  521. self._last_host_check = 0
  522. self._last_bw_usage_poll = 0
  523. self._bw_usage_supported = True
  524. self._last_bw_usage_cell_update = 0
  525. self.compute_api = compute.API()
  526. self.compute_rpcapi = compute_rpcapi.ComputeAPI()
  527. self.conductor_api = conductor.API()
  528. self.compute_task_api = conductor.ComputeTaskAPI()
  529. self.is_neutron_security_groups = (
  530. openstack_driver.is_neutron_security_groups())
  531. self.consoleauth_rpcapi = consoleauth.rpcapi.ConsoleAuthAPI()
  532. self.cells_rpcapi = cells_rpcapi.CellsAPI()
  533. self.scheduler_rpcapi = scheduler_rpcapi.SchedulerAPI()
  534. self._resource_tracker_dict = {}
  535. self.instance_events = InstanceEvents()
  536. self._sync_power_pool = eventlet.GreenPool()
  537. self._syncs_in_progress = {}
  538. super(ComputeManager, self).__init__(service_name="compute",
  539. *args, **kwargs)
  540. # NOTE(russellb) Load the driver last. It may call back into the
  541. # compute manager via the virtapi, so we want it to be fully
  542. # initialized before that happens.
  543. self.driver = driver.load_compute_driver(self.virtapi, compute_driver)
  544. self.use_legacy_block_device_info = \
  545. self.driver.need_legacy_block_device_info
  546. def _get_resource_tracker(self, nodename):
  547. rt = self._resource_tracker_dict.get(nodename)
  548. if not rt:
  549. if not self.driver.node_is_available(nodename):
  550. raise exception.NovaException(
  551. _("%s is not a valid node managed by this "
  552. "compute host.") % nodename)
  553. rt = resource_tracker.ResourceTracker(self.host,
  554. self.driver,
  555. nodename)
  556. self._resource_tracker_dict[nodename] = rt
  557. return rt
  558. def _update_resource_tracker(self, context, instance):
  559. """Let the resource tracker know that an instance has changed state."""
  560. if (instance['host'] == self.host and
  561. self.driver.node_is_available(instance['node'])):
  562. rt = self._get_resource_tracker(instance.get('node'))
  563. rt.update_usage(context, instance)
  564. def _instance_update(self, context, instance_uuid, **kwargs):
  565. """Update an instance in the database using kwargs as value."""
  566. instance_ref = self.conductor_api.instance_update(context,
  567. instance_uuid,
  568. **kwargs)
  569. self._update_resource_tracker(context, instance_ref)
  570. return instance_ref
  571. def _set_instance_error_state(self, context, instance):
  572. instance_uuid = instance['uuid']
  573. try:
  574. self._instance_update(context, instance_uuid,
  575. vm_state=vm_states.ERROR)
  576. except exception.InstanceNotFound:
  577. LOG.debug('Instance has been destroyed from under us while '
  578. 'trying to set it to ERROR',
  579. instance_uuid=instance_uuid)
  580. def _set_instance_obj_error_state(self, context, instance):
  581. try:
  582. instance.vm_state = vm_states.ERROR
  583. instance.save()
  584. except exception.InstanceNotFound:
  585. LOG.debug('Instance has been destroyed from under us while '
  586. 'trying to set it to ERROR', instance=instance)
  587. def _get_instances_on_driver(self, context, filters=None):
  588. """Return a list of instance records for the instances found
  589. on the hypervisor which satisfy the specified filters. If filters=None
  590. return a list of instance records for all the instances found on the
  591. hypervisor.
  592. """
  593. if not filters:
  594. filters = {}
  595. try:
  596. driver_uuids = self.driver.list_instance_uuids()
  597. if len(driver_uuids) == 0:
  598. # Short circuit, don't waste a DB call
  599. return objects.InstanceList()
  600. filters['uuid'] = driver_uuids
  601. local_instances = objects.InstanceList.get_by_filters(
  602. context, filters, use_slave=True)
  603. return local_instances
  604. except NotImplementedError:
  605. pass
  606. # The driver doesn't support uuids listing, so we'll have
  607. # to brute force.
  608. driver_instances = self.driver.list_instances()
  609. instances = objects.InstanceList.get_by_filters(context, filters,
  610. use_slave=True)
  611. name_map = dict((instance.name, instance) for instance in instances)
  612. local_instances = []
  613. for driver_instance in driver_instances:
  614. instance = name_map.get(driver_instance)
  615. if not instance:
  616. continue
  617. local_instances.append(instance)
  618. return local_instances
  619. def _destroy_evacuated_instances(self, context):
  620. """Destroys evacuated instances.
  621. While nova-compute was down, the instances running on it could be
  622. evacuated to another host. Check that the instances reported
  623. by the driver are still associated with this host. If they are
  624. not, destroy them, with the exception of instances which are in
  625. the MIGRATING, RESIZE_MIGRATING, RESIZE_MIGRATED, RESIZE_FINISH
  626. task state or RESIZED vm state.
  627. """
  628. our_host = self.host
  629. filters = {'deleted': False}
  630. local_instances = self._get_instances_on_driver(context, filters)
  631. for instance in local_instances:
  632. if instance.host != our_host:
  633. if (instance.task_state in [task_states.MIGRATING,
  634. task_states.RESIZE_MIGRATING,
  635. task_states.RESIZE_MIGRATED,
  636. task_states.RESIZE_FINISH]
  637. or instance.vm_state in [vm_states.RESIZED]):
  638. LOG.debug('Will not delete instance as its host ('
  639. '%(instance_host)s) is not equal to our '
  640. 'host (%(our_host)s) but its task state is '
  641. '(%(task_state)s) and vm state is '
  642. '(%(vm_state)s)',
  643. {'instance_host': instance.host,
  644. 'our_host': our_host,
  645. 'task_state': instance.task_state,
  646. 'vm_state': instance.vm_state},
  647. instance=instance)
  648. continue
  649. LOG.info(_('Deleting instance as its host ('
  650. '%(instance_host)s) is not equal to our '
  651. 'host (%(our_host)s).'),
  652. {'instance_host': instance.host,
  653. 'our_host': our_host}, instance=instance)
  654. try:
  655. network_info = self._get_instance_nw_info(context,
  656. instance)
  657. bdi = self._get_instance_block_device_info(context,
  658. instance)
  659. destroy_disks = not (self._is_instance_storage_shared(
  660. context, instance))
  661. except exception.InstanceNotFound:
  662. network_info = network_model.NetworkInfo()
  663. bdi = {}
  664. LOG.info(_('Instance has been marked deleted already, '
  665. 'removing it from the hypervisor.'),
  666. instance=instance)
  667. # always destroy disks if the instance was deleted
  668. destroy_disks = True
  669. self.driver.destroy(context, instance,
  670. network_info,
  671. bdi, destroy_disks)
  672. def _is_instance_storage_shared(self, context, instance):
  673. shared_storage = True
  674. data = None
  675. try:
  676. data = self.driver.check_instance_shared_storage_local(context,
  677. instance)
  678. if data:
  679. shared_storage = (self.compute_rpcapi.
  680. check_instance_shared_storage(context,
  681. instance, data))
  682. except NotImplementedError:
  683. LOG.warning(_('Hypervisor driver does not support '
  684. 'instance shared storage check, '
  685. 'assuming it\'s not on shared storage'),
  686. instance=instance)
  687. shared_storage = False
  688. except Exception:
  689. LOG.exception(_LE('Failed to check if instance shared'),
  690. instance=instance)
  691. finally:
  692. if data:
  693. self.driver.check_instance_shared_storage_cleanup(context,
  694. data)
  695. return shared_storage
  696. def _complete_partial_deletion(self, context, instance):
  697. """Complete deletion for instances in DELETED status but not marked as
  698. deleted in the DB
  699. """
  700. instance.destroy()
  701. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  702. context, instance.uuid)
  703. quotas = objects.Quotas(context)
  704. project_id, user_id = quotas_obj.ids_from_instance(context, instance)
  705. quotas.reserve(context, project_id=project_id, user_id=user_id,
  706. instances=-1, cores=-instance.vcpus,
  707. ram=-instance.memory_mb)
  708. self._complete_deletion(context,
  709. instance,
  710. bdms,
  711. quotas,
  712. instance.system_metadata)
  713. def _complete_deletion(self, context, instance, bdms,
  714. quotas, system_meta):
  715. if quotas:
  716. quotas.commit()
  717. # ensure block device mappings are not leaked
  718. for bdm in bdms:
  719. bdm.destroy()
  720. self._notify_about_instance_usage(context, instance, "delete.end",
  721. system_metadata=system_meta)
  722. if CONF.vnc_enabled or CONF.spice.enabled:
  723. if CONF.cells.enable:
  724. self.cells_rpcapi.consoleauth_delete_tokens(context,
  725. instance.uuid)
  726. else:
  727. self.consoleauth_rpcapi.delete_tokens_for_instance(context,
  728. instance.uuid)
  729. def _init_instance(self, context, instance):
  730. '''Initialize this instance during service init.'''
  731. # Instances that are shut down, or in an error state can not be
  732. # initialized and are not attempted to be recovered. The exception
  733. # to this are instances that are in RESIZE_MIGRATING or DELETING,
  734. # which are dealt with further down.
  735. if (instance.vm_state == vm_states.SOFT_DELETED or
  736. (instance.vm_state == vm_states.ERROR and
  737. instance.task_state not in
  738. (task_states.RESIZE_MIGRATING, task_states.DELETING))):
  739. LOG.debug("Instance is in %s state.",
  740. instance.vm_state, instance=instance)
  741. return
  742. if instance.vm_state == vm_states.DELETED:
  743. try:
  744. self._complete_partial_deletion(context, instance)
  745. except Exception:
  746. # we don't want that an exception blocks the init_host
  747. msg = _LE('Failed to complete a deletion')
  748. LOG.exception(msg, instance=instance)
  749. return
  750. if (instance.vm_state == vm_states.BUILDING or
  751. instance.task_state in [task_states.SCHEDULING,
  752. task_states.BLOCK_DEVICE_MAPPING,
  753. task_states.NETWORKING,
  754. task_states.SPAWNING]):
  755. # NOTE(dave-mcnally) compute stopped before instance was fully
  756. # spawned so set to ERROR state. This is safe to do as the state
  757. # may be set by the api but the host is not so if we get here the
  758. # instance has already been scheduled to this particular host.
  759. LOG.debug("Instance failed to spawn correctly, "
  760. "setting to ERROR state", instance=instance)
  761. instance.task_state = None
  762. instance.vm_state = vm_states.ERROR
  763. instance.save()
  764. return
  765. if (instance.vm_state in [vm_states.ACTIVE, vm_states.STOPPED] and
  766. instance.task_state in [task_states.REBUILDING,
  767. task_states.REBUILD_BLOCK_DEVICE_MAPPING,
  768. task_states.REBUILD_SPAWNING]):
  769. # NOTE(jichenjc) compute stopped before instance was fully
  770. # spawned so set to ERROR state. This is consistent to BUILD
  771. LOG.debug("Instance failed to rebuild correctly, "
  772. "setting to ERROR state", instance=instance)
  773. instance.task_state = None
  774. instance.vm_state = vm_states.ERROR
  775. instance.save()
  776. return
  777. if (instance.vm_state != vm_states.ERROR and
  778. instance.task_state in [task_states.IMAGE_SNAPSHOT_PENDING,
  779. task_states.IMAGE_PENDING_UPLOAD,
  780. task_states.IMAGE_UPLOADING,
  781. task_states.IMAGE_SNAPSHOT]):
  782. LOG.debug("Instance in transitional state %s at start-up "
  783. "clearing task state",
  784. instance['task_state'], instance=instance)
  785. try:
  786. self._post_interrupted_snapshot_cleanup(context, instance)
  787. except Exception:
  788. # we don't want that an exception blocks the init_host
  789. msg = _LE('Failed to cleanup snapshot.')
  790. LOG.exception(msg, instance=instance)
  791. instance.task_state = None
  792. instance.save()
  793. if instance.task_state == task_states.DELETING:
  794. try:
  795. LOG.info(_('Service started deleting the instance during '
  796. 'the previous run, but did not finish. Restarting '
  797. 'the deletion now.'), instance=instance)
  798. instance.obj_load_attr('metadata')
  799. instance.obj_load_attr('system_metadata')
  800. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  801. context, instance.uuid)
  802. # FIXME(comstud): This needs fixed. We should be creating
  803. # reservations and updating quotas, because quotas
  804. # wouldn't have been updated for this instance since it is
  805. # still in DELETING. See bug 1296414.
  806. #
  807. # Create a dummy quota object for now.
  808. quotas = objects.Quotas.from_reservations(
  809. context, None, instance=instance)
  810. self._delete_instance(context, instance, bdms, quotas)
  811. except Exception:
  812. # we don't want that an exception blocks the init_host
  813. msg = _LE('Failed to complete a deletion')
  814. LOG.exception(msg, instance=instance)
  815. self._set_instance_error_state(context, instance)
  816. return
  817. try_reboot, reboot_type = self._retry_reboot(context, instance)
  818. current_power_state = self._get_power_state(context, instance)
  819. if try_reboot:
  820. LOG.debug("Instance in transitional state (%(task_state)s) at "
  821. "start-up and power state is (%(power_state)s), "
  822. "triggering reboot",
  823. {'task_state': instance['task_state'],
  824. 'power_state': current_power_state},
  825. instance=instance)
  826. self.compute_rpcapi.reboot_instance(context, instance,
  827. block_device_info=None,
  828. reboot_type=reboot_type)
  829. return
  830. elif (current_power_state == power_state.RUNNING and
  831. instance.task_state in [task_states.REBOOT_STARTED,
  832. task_states.REBOOT_STARTED_HARD]):
  833. LOG.warning(_("Instance in transitional state "
  834. "(%(task_state)s) at start-up and power state "
  835. "is (%(power_state)s), clearing task state"),
  836. {'task_state': instance['task_state'],
  837. 'power_state': current_power_state},
  838. instance=instance)
  839. instance.task_state = None
  840. instance.vm_state = vm_states.ACTIVE
  841. instance.save()
  842. if instance.task_state == task_states.POWERING_OFF:
  843. try:
  844. LOG.debug("Instance in transitional state %s at start-up "
  845. "retrying stop request",
  846. instance['task_state'], instance=instance)
  847. self.stop_instance(context, instance)
  848. except Exception:
  849. # we don't want that an exception blocks the init_host
  850. msg = _LE('Failed to stop instance')
  851. LOG.exception(msg, instance=instance)
  852. return
  853. if instance.task_state == task_states.POWERING_ON:
  854. try:
  855. LOG.debug("Instance in transitional state %s at start-up "
  856. "retrying start request",
  857. instance['task_state'], instance=instance)
  858. self.start_instance(context, instance)
  859. except Exception:
  860. # we don't want that an exception blocks the init_host
  861. msg = _LE('Failed to start instance')
  862. LOG.exception(msg, instance=instance)
  863. return
  864. net_info = compute_utils.get_nw_info_for_instance(instance)
  865. try:
  866. self.driver.plug_vifs(instance, net_info)
  867. except NotImplementedError as e:
  868. LOG.debug(e, instance=instance)
  869. if instance.task_state == task_states.RESIZE_MIGRATING:
  870. # We crashed during resize/migration, so roll back for safety
  871. try:
  872. # NOTE(mriedem): check old_vm_state for STOPPED here, if it's
  873. # not in system_metadata we default to True for backwards
  874. # compatibility
  875. power_on = (instance.system_metadata.get('old_vm_state') !=
  876. vm_states.STOPPED)
  877. block_dev_info = self._get_instance_block_device_info(context,
  878. instance)
  879. self.driver.finish_revert_migration(context,
  880. instance, net_info, block_dev_info, power_on)
  881. except Exception as e:
  882. LOG.exception(_LE('Failed to revert crashed migration'),
  883. instance=instance)
  884. finally:
  885. LOG.info(_('Instance found in migrating state during '
  886. 'startup. Resetting task_state'),
  887. instance=instance)
  888. instance.task_state = None
  889. instance.save()
  890. if instance.task_state == task_states.MIGRATING:
  891. # Live migration did not complete, but instance is on this
  892. # host, so reset the state.
  893. instance.task_state = None
  894. instance.save(expected_task_state=[task_states.MIGRATING])
  895. db_state = instance.power_state
  896. drv_state = self._get_power_state(context, instance)
  897. expect_running = (db_state == power_state.RUNNING and
  898. drv_state != db_state)
  899. LOG.debug('Current state is %(drv_state)s, state in DB is '
  900. '%(db_state)s.',
  901. {'drv_state': drv_state, 'db_state': db_state},
  902. instance=instance)
  903. if expect_running and CONF.resume_guests_state_on_host_boot:
  904. LOG.info(_('Rebooting instance after nova-compute restart.'),
  905. instance=instance)
  906. block_device_info = \
  907. self._get_instance_block_device_info(context, instance)
  908. try:
  909. self.driver.resume_state_on_host_boot(
  910. context, instance, net_info, block_device_info)
  911. except NotImplementedError:
  912. LOG.warning(_('Hypervisor driver does not support '
  913. 'resume guests'), instance=instance)
  914. except Exception:
  915. # NOTE(vish): The instance failed to resume, so we set the
  916. # instance to error and attempt to continue.
  917. LOG.warning(_('Failed to resume instance'), instance=instance)
  918. self._set_instance_error_state(context, instance)
  919. elif drv_state == power_state.RUNNING:
  920. # VMwareAPI drivers will raise an exception
  921. try:
  922. self.driver.ensure_filtering_rules_for_instance(
  923. instance, net_info)
  924. except NotImplementedError:
  925. LOG.warning(_('Hypervisor driver does not support '
  926. 'firewall rules'), instance=instance)
  927. def _retry_reboot(self, context, instance):
  928. current_power_state = self._get_power_state(context, instance)
  929. current_task_state = instance.task_state
  930. retry_reboot = False
  931. reboot_type = compute_utils.get_reboot_type(current_task_state,
  932. current_power_state)
  933. pending_soft = (current_task_state == task_states.REBOOT_PENDING and
  934. instance.vm_state in vm_states.ALLOW_SOFT_REBOOT)
  935. pending_hard = (current_task_state == task_states.REBOOT_PENDING_HARD
  936. and instance.vm_state in vm_states.ALLOW_HARD_REBOOT)
  937. started_not_running = (current_task_state in
  938. [task_states.REBOOT_STARTED,
  939. task_states.REBOOT_STARTED_HARD] and
  940. current_power_state != power_state.RUNNING)
  941. if pending_soft or pending_hard or started_not_running:
  942. retry_reboot = True
  943. return retry_reboot, reboot_type
  944. def handle_lifecycle_event(self, event):
  945. LOG.info(_("VM %(state)s (Lifecycle Event)") %
  946. {'state': event.get_name()},
  947. instance_uuid=event.get_instance_uuid())
  948. context = nova.context.get_admin_context(read_deleted='yes')
  949. instance = objects.Instance.get_by_uuid(context,
  950. event.get_instance_uuid())
  951. vm_power_state = None
  952. if event.get_transition() == virtevent.EVENT_LIFECYCLE_STOPPED:
  953. vm_power_state = power_state.SHUTDOWN
  954. elif event.get_transition() == virtevent.EVENT_LIFECYCLE_STARTED:
  955. vm_power_state = power_state.RUNNING
  956. elif event.get_transition() == virtevent.EVENT_LIFECYCLE_PAUSED:
  957. vm_power_state = power_state.PAUSED
  958. elif event.get_transition() == virtevent.EVENT_LIFECYCLE_RESUMED:
  959. vm_power_state = power_state.RUNNING
  960. else:
  961. LOG.warning(_("Unexpected power state %d") %
  962. event.get_transition())
  963. if vm_power_state is not None:
  964. LOG.debug('Synchronizing instance power state after lifecycle '
  965. 'event "%(event)s"; current vm_state: %(vm_state)s, '
  966. 'current task_state: %(task_state)s, current DB '
  967. 'power_state: %(db_power_state)s, VM power_state: '
  968. '%(vm_power_state)s',
  969. dict(event=event.get_name(),
  970. vm_state=instance.vm_state,
  971. task_state=instance.task_state,
  972. db_power_state=instance.power_state,
  973. vm_power_state=vm_power_state),
  974. instance_uuid=instance.uuid)
  975. self._sync_instance_power_state(context,
  976. instance,
  977. vm_power_state)
  978. def handle_events(self, event):
  979. if isinstance(event, virtevent.LifecycleEvent):
  980. try:
  981. self.handle_lifecycle_event(event)
  982. except exception.InstanceNotFound:
  983. LOG.debug("Event %s arrived for non-existent instance. The "
  984. "instance was probably deleted.", event)
  985. else:
  986. LOG.debug("Ignoring event %s", event)
  987. def init_virt_events(self):
  988. self.driver.register_event_listener(self.handle_events)
  989. def init_host(self):
  990. """Initialization for a standalone compute service."""
  991. self.driver.init_host(host=self.host)
  992. context = nova.context.get_admin_context()
  993. instances = objects.InstanceList.get_by_host(
  994. context, self.host, expected_attrs=['info_cache'])
  995. if CONF.defer_iptables_apply:
  996. self.driver.filter_defer_apply_on()
  997. self.init_virt_events()
  998. try:
  999. # checking that instance was not already evacuated to other host
  1000. self._destroy_evacuated_instances(context)
  1001. for instance in instances:
  1002. self._init_instance(context, instance)
  1003. finally:
  1004. if CONF.defer_iptables_apply:
  1005. self.driver.filter_defer_apply_off()
  1006. def cleanup_host(self):
  1007. self.driver.cleanup_host(host=self.host)
  1008. def pre_start_hook(self):
  1009. """After the service is initialized, but before we fully bring
  1010. the service up by listening on RPC queues, make sure to update
  1011. our available resources (and indirectly our available nodes).
  1012. """
  1013. self.update_available_resource(nova.context.get_admin_context())
  1014. def _get_power_state(self, context, instance):
  1015. """Retrieve the power state for the given instance."""
  1016. LOG.debug('Checking state', instance=instance)
  1017. try:
  1018. return self.driver.get_info(instance)["state"]
  1019. except exception.NotFound:
  1020. return power_state.NOSTATE
  1021. def get_console_topic(self, context):
  1022. """Retrieves the console host for a project on this host.
  1023. Currently this is just set in the flags for each compute host.
  1024. """
  1025. # TODO(mdragon): perhaps make this variable by console_type?
  1026. return '%s.%s' % (CONF.console_topic, CONF.console_host)
  1027. def get_console_pool_info(self, context, console_type):
  1028. return self.driver.get_console_pool_info(console_type)
  1029. @wrap_exception()
  1030. def refresh_security_group_rules(self, context, security_group_id):
  1031. """Tell the virtualization driver to refresh security group rules.
  1032. Passes straight through to the virtualization driver.
  1033. """
  1034. return self.driver.refresh_security_group_rules(security_group_id)
  1035. @wrap_exception()
  1036. def refresh_security_group_members(self, context, security_group_id):
  1037. """Tell the virtualization driver to refresh security group members.
  1038. Passes straight through to the virtualization driver.
  1039. """
  1040. return self.driver.refresh_security_group_members(security_group_id)
  1041. @wrap_exception()
  1042. def refresh_instance_security_rules(self, context, instance):
  1043. """Tell the virtualization driver to refresh security rules for
  1044. an instance.
  1045. Passes straight through to the virtualization driver.
  1046. Synchronise the call because we may still be in the middle of
  1047. creating the instance.
  1048. """
  1049. @utils.synchronized(instance['uuid'])
  1050. def _sync_refresh():
  1051. try:
  1052. return self.driver.refresh_instance_security_rules(instance)
  1053. except NotImplementedError:
  1054. LOG.warning(_('Hypervisor driver does not support '
  1055. 'security groups.'), instance=instance)
  1056. return _sync_refresh()
  1057. @wrap_exception()
  1058. def refresh_provider_fw_rules(self, context):
  1059. """This call passes straight through to the virtualization driver."""
  1060. return self.driver.refresh_provider_fw_rules()
  1061. def _get_instance_nw_info(self, context, instance, use_slave=False):
  1062. """Get a list of dictionaries of network data of an instance."""
  1063. if (not hasattr(instance, 'system_metadata') or
  1064. len(instance['system_metadata']) == 0):
  1065. # NOTE(danms): Several places in the code look up instances without
  1066. # pulling system_metadata for performance, and call this function.
  1067. # If we get an instance without it, re-fetch so that the call
  1068. # to network_api (which requires it for instance_type) will
  1069. # succeed.
  1070. instance = objects.Instance.get_by_uuid(context,
  1071. instance['uuid'],
  1072. use_slave=use_slave)
  1073. network_info = self.network_api.get_instance_nw_info(context,
  1074. instance)
  1075. return network_info
  1076. def _await_block_device_map_created(self, context, vol_id):
  1077. # TODO(yamahata): creating volume simultaneously
  1078. # reduces creation time?
  1079. # TODO(yamahata): eliminate dumb polling
  1080. start = time.time()
  1081. retries = CONF.block_device_allocate_retries
  1082. if retries < 0:
  1083. LOG.warn(_LW("Treating negative config value (%(retries)s) for "
  1084. "'block_device_retries' as 0."),
  1085. {'retries': retries})
  1086. # (1) treat negative config value as 0
  1087. # (2) the configured value is 0, one attempt should be made
  1088. # (3) the configured value is > 0, then the total number attempts
  1089. # is (retries + 1)
  1090. attempts = 1
  1091. if retries >= 1:
  1092. attempts = retries + 1
  1093. for attempt in range(1, attempts + 1):
  1094. volume = self.volume_api.get(context, vol_id)
  1095. volume_status = volume['status']
  1096. if volume_status not in ['creating', 'downloading']:
  1097. if volume_status != 'available':
  1098. LOG.warn(_("Volume id: %s finished being created but was"
  1099. " not set as 'available'"), vol_id)
  1100. return attempt
  1101. greenthread.sleep(CONF.block_device_allocate_retries_interval)
  1102. # NOTE(harlowja): Should only happen if we ran out of attempts
  1103. raise exception.VolumeNotCreated(volume_id=vol_id,
  1104. seconds=int(time.time() - start),
  1105. attempts=attempts)
  1106. def _decode_files(self, injected_files):
  1107. """Base64 decode the list of files to inject."""
  1108. if not injected_files:
  1109. return []
  1110. def _decode(f):
  1111. path, contents = f
  1112. try:
  1113. decoded = base64.b64decode(contents)
  1114. return path, decoded
  1115. except TypeError:
  1116. raise exception.Base64Exception(path=path)
  1117. return [_decode(f) for f in injected_files]
  1118. def _run_instance(self, context, request_spec,
  1119. filter_properties, requested_networks, injected_files,
  1120. admin_password, is_first_time, node, instance,
  1121. legacy_bdm_in_spec):
  1122. """Launch a new instance with specified options."""
  1123. extra_usage_info = {}
  1124. def notify(status, msg="", fault=None, **kwargs):
  1125. """Send a create.{start,error,end} notification."""
  1126. type_ = "create.%(status)s" % dict(status=status)
  1127. info = extra_usage_info.copy()
  1128. info['message'] = msg
  1129. self._notify_about_instance_usage(context, instance, type_,
  1130. extra_usage_info=info, fault=fault, **kwargs)
  1131. try:
  1132. self._prebuild_instance(context, instance)
  1133. if request_spec and request_spec.get('image'):
  1134. image_meta = request_spec['image']
  1135. else:
  1136. image_meta = {}
  1137. extra_usage_info = {"image_name": image_meta.get('name', '')}
  1138. notify("start") # notify that build is starting
  1139. instance, network_info = self._build_instance(context,
  1140. request_spec, filter_properties, requested_networks,
  1141. injected_files, admin_password, is_first_time, node,
  1142. instance, image_meta, legacy_bdm_in_spec)
  1143. notify("end", msg=_("Success"), network_info=network_info)
  1144. except exception.RescheduledException as e:
  1145. # Instance build encountered an error, and has been rescheduled.
  1146. notify("error", fault=e)
  1147. except exception.BuildAbortException as e:
  1148. # Instance build aborted due to a non-failure
  1149. LOG.info(e)
  1150. notify("end", msg=e.format_message()) # notify that build is done
  1151. except Exception as e:
  1152. # Instance build encountered a non-recoverable error:
  1153. with excutils.save_and_reraise_exception():
  1154. self._set_instance_error_state(context, instance)
  1155. notify("error", fault=e) # notify that build failed
  1156. def _prebuild_instance(self, context, instance):
  1157. self._check_instance_exists(context, instance)
  1158. try:
  1159. self._start_building(context, instance)
  1160. except (exception.InstanceNotFound,
  1161. exception.UnexpectedDeletingTaskStateError):
  1162. msg = _("Instance disappeared before we could start it")
  1163. # Quickly bail out of here
  1164. raise exception.BuildAbortException(instance_uuid=instance.uuid,
  1165. reason=msg)
  1166. def _validate_instance_group_policy(self, context, instance,
  1167. filter_properties):
  1168. # NOTE(russellb) Instance group policy is enforced by the scheduler.
  1169. # However, there is a race condition with the enforcement of
  1170. # anti-affinity. Since more than one instance may be scheduled at the
  1171. # same time, it's possible that more than one instance with an
  1172. # anti-affinity policy may end up here. This is a validation step to
  1173. # make sure that starting the instance here doesn't violate the policy.
  1174. scheduler_hints = filter_properties.get('scheduler_hints') or {}
  1175. group_hint = scheduler_hints.get('group')
  1176. if not group_hint:
  1177. return
  1178. @utils.synchronized(group_hint)
  1179. def _do_validation(context, instance, group_hint):
  1180. group = objects.InstanceGroup.get_by_hint(context, group_hint)
  1181. if 'anti-affinity' not in group.policies:
  1182. return
  1183. group_hosts = group.get_hosts(context, exclude=[instance.uuid])
  1184. if self.host in group_hosts:
  1185. msg = _("Anti-affinity instance group policy was violated.")
  1186. raise exception.RescheduledException(
  1187. instance_uuid=instance.uuid,
  1188. reason=msg)
  1189. _do_validation(context, instance, group_hint)
  1190. def _build_instance(self, context, request_spec, filter_properties,
  1191. requested_networks, injected_files, admin_password, is_first_time,
  1192. node, instance, image_meta, legacy_bdm_in_spec):
  1193. original_context = context
  1194. context = context.elevated()
  1195. # NOTE(danms): This method is deprecated, but could be called,
  1196. # and if it is, it will have an old megatuple for requested_networks.
  1197. if requested_networks is not None:
  1198. requested_networks_obj = objects.NetworkRequestList(
  1199. objects=[objects.NetworkRequest.from_tuple(t)
  1200. for t in requested_networks])
  1201. else:
  1202. requested_networks_obj = None
  1203. # If neutron security groups pass requested security
  1204. # groups to allocate_for_instance()
  1205. if request_spec and self.is_neutron_security_groups:
  1206. security_groups = request_spec.get('security_group')
  1207. else:
  1208. security_groups = []
  1209. if node is None:
  1210. node = self.driver.get_available_nodes(refresh=True)[0]
  1211. LOG.debug("No node specified, defaulting to %s", node)
  1212. network_info = None
  1213. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  1214. context, instance.uuid)
  1215. # b64 decode the files to inject:
  1216. injected_files_orig = injected_files
  1217. injected_files = self._decode_files(injected_files)
  1218. rt = self._get_resource_tracker(node)
  1219. try:
  1220. limits = filter_properties.get('limits', {})
  1221. with rt.instance_claim(context, instance, limits):
  1222. # NOTE(russellb) It's important that this validation be done
  1223. # *after* the resource tracker instance claim, as that is where
  1224. # the host is set on the instance.
  1225. self._validate_instance_group_policy(context, instance,
  1226. filter_properties)
  1227. macs = self.driver.macs_for_instance(instance)
  1228. dhcp_options = self.driver.dhcp_options_for_instance(instance)
  1229. network_info = self._allocate_network(original_context,
  1230. instance, requested_networks_obj, macs,
  1231. security_groups, dhcp_options)
  1232. # Verify that all the BDMs have a device_name set and assign a
  1233. # default to the ones missing it with the help of the driver.
  1234. self._default_block_device_names(context, instance, image_meta,
  1235. bdms)
  1236. instance.vm_state = vm_states.BUILDING
  1237. instance.task_state = task_states.BLOCK_DEVICE_MAPPING
  1238. instance.save()
  1239. block_device_info = self._prep_block_device(
  1240. context, instance, bdms)
  1241. set_access_ip = (is_first_time and
  1242. not instance.access_ip_v4 and
  1243. not instance.access_ip_v6)
  1244. instance = self._spawn(context, instance, image_meta,
  1245. network_info, block_device_info,
  1246. injected_files, admin_password,
  1247. set_access_ip=set_access_ip)
  1248. except (exception.InstanceNotFound,
  1249. exception.UnexpectedDeletingTaskStateError):
  1250. # the instance got deleted during the spawn
  1251. # Make sure the async call finishes
  1252. if network_info is not None:
  1253. network_info.wait(do_raise=False)
  1254. try:
  1255. self._deallocate_network(context, instance)
  1256. except Exception:
  1257. msg = _LE('Failed to dealloc network '
  1258. 'for deleted instance')
  1259. LOG.exception(msg, instance=instance)
  1260. raise exception.BuildAbortException(
  1261. instance_uuid=instance.uuid,
  1262. reason=_("Instance disappeared during build"))
  1263. except (exception.UnexpectedTaskStateError,
  1264. exception.VirtualInterfaceCreateException) as e:
  1265. # Don't try to reschedule, just log and reraise.
  1266. with excutils.save_and_reraise_exception():
  1267. LOG.debug(e.format_message(), instance=instance)
  1268. # Make sure the async call finishes
  1269. if network_info is not None:
  1270. network_info.wait(do_raise=False)
  1271. except exception.InvalidBDM:
  1272. with excutils.save_and_reraise_exception():
  1273. if network_info is not None:
  1274. network_info.wait(do_raise=False)
  1275. try:
  1276. self._deallocate_network(context, instance)
  1277. except Exception:
  1278. msg = _LE('Failed to dealloc network '
  1279. 'for failed instance')
  1280. LOG.exception(msg, instance=instance)
  1281. except Exception:
  1282. exc_info = sys.exc_info()
  1283. # try to re-schedule instance:
  1284. # Make sure the async call finishes
  1285. if network_info is not None:
  1286. network_info.wait(do_raise=False)
  1287. rescheduled = self._reschedule_or_error(original_context, instance,
  1288. exc_info, requested_networks, admin_password,
  1289. injected_files_orig, is_first_time, request_spec,
  1290. filter_properties, bdms, legacy_bdm_in_spec)
  1291. if rescheduled:
  1292. # log the original build error
  1293. self._log_original_error(exc_info, instance.uuid)
  1294. raise exception.RescheduledException(
  1295. instance_uuid=instance.uuid,
  1296. reason=six.text_type(exc_info[1]))
  1297. else:
  1298. # not re-scheduling, go to error:
1299. six.reraise(exc_info[0], exc_info[1], exc_info[2])
  1300. # spawn success
  1301. return instance, network_info
  1302. def _log_original_error(self, exc_info, instance_uuid):
  1303. LOG.error(_LE('Error: %s'), exc_info[1], instance_uuid=instance_uuid,
  1304. exc_info=exc_info)
  1305. def _reschedule_or_error(self, context, instance, exc_info,
  1306. requested_networks, admin_password, injected_files, is_first_time,
  1307. request_spec, filter_properties, bdms=None,
  1308. legacy_bdm_in_spec=True):
  1309. """Try to re-schedule the build or re-raise the original build error to
  1310. error out the instance.
  1311. """
  1312. original_context = context
  1313. context = context.elevated()
  1314. instance_uuid = instance.uuid
  1315. rescheduled = False
  1316. compute_utils.add_instance_fault_from_exc(context,
  1317. instance, exc_info[1], exc_info=exc_info)
  1318. self._notify_about_instance_usage(context, instance,
  1319. 'instance.create.error', fault=exc_info[1])
  1320. try:
  1321. LOG.debug("Clean up resource before rescheduling.",
  1322. instance=instance)
  1323. if bdms is None:
  1324. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  1325. context, instance.uuid)
  1326. self._shutdown_instance(context, instance,
  1327. bdms, requested_networks)
  1328. self._cleanup_volumes(context, instance.uuid, bdms)
  1329. except Exception:
  1330. # do not attempt retry if clean up failed:
  1331. with excutils.save_and_reraise_exception():
  1332. self._log_original_error(exc_info, instance_uuid)
  1333. try:
  1334. method_args = (request_spec, admin_password, injected_files,
  1335. requested_networks, is_first_time, filter_properties,
  1336. legacy_bdm_in_spec)
  1337. task_state = task_states.SCHEDULING
  1338. rescheduled = self._reschedule(original_context, request_spec,
  1339. filter_properties, instance,
  1340. self.scheduler_rpcapi.run_instance, method_args,
  1341. task_state, exc_info)
  1342. except Exception:
  1343. rescheduled = False
  1344. LOG.exception(_LE("Error trying to reschedule"),
  1345. instance_uuid=instance_uuid)
  1346. return rescheduled
  1347. def _reschedule(self, context, request_spec, filter_properties,
  1348. instance, reschedule_method, method_args, task_state,
  1349. exc_info=None):
  1350. """Attempt to re-schedule a compute operation."""
  1351. instance_uuid = instance.uuid
  1352. retry = filter_properties.get('retry', None)
  1353. if not retry:
  1354. # no retry information, do not reschedule.
  1355. LOG.debug("Retry info not present, will not reschedule",
  1356. instance_uuid=instance_uuid)
  1357. return
  1358. if not request_spec:
  1359. LOG.debug("No request spec, will not reschedule",
  1360. instance_uuid=instance_uuid)
  1361. return
  1362. request_spec['instance_uuids'] = [instance_uuid]
  1363. LOG.debug("Re-scheduling %(method)s: attempt %(num)d",
  1364. {'method': reschedule_method.func_name,
  1365. 'num': retry['num_attempts']}, instance_uuid=instance_uuid)
  1366. # reset the task state:
  1367. self._instance_update(context, instance_uuid, task_state=task_state)
  1368. if exc_info:
  1369. # stringify to avoid circular ref problem in json serialization:
  1370. retry['exc'] = traceback.format_exception_only(exc_info[0],
  1371. exc_info[1])
  1372. reschedule_method(context, *method_args)
  1373. return True
  1374. @periodic_task.periodic_task
  1375. def _check_instance_build_time(self, context):
  1376. """Ensure that instances are not stuck in build."""
  1377. timeout = CONF.instance_build_timeout
  1378. if timeout == 0:
  1379. return
  1380. filters = {'vm_state': vm_states.BUILDING,
  1381. 'host': self.host}
  1382. building_insts = objects.InstanceList.get_by_filters(context,
  1383. filters, expected_attrs=[], use_slave=True)
  1384. for instance in building_insts:
  1385. if timeutils.is_older_than(instance['created_at'], timeout):
  1386. self._set_instance_error_state(context, instance)
  1387. LOG.warn(_("Instance build timed out. Set to error state."),
  1388. instance=instance)
  1389. def _check_instance_exists(self, context, instance):
  1390. """Ensure an instance with the same name is not already present."""
  1391. if self.driver.instance_exists(instance):
  1392. raise exception.InstanceExists(name=instance.name)
  1393. def _start_building(self, context, instance):
  1394. """Save the host and launched_on fields and log appropriately."""
  1395. LOG.audit(_('Starting instance...'), context=context,
  1396. instance=instance)
  1397. self._instance_update(context, instance.uuid,
  1398. vm_state=vm_states.BUILDING,
  1399. task_state=None,
  1400. expected_task_state=(task_states.SCHEDULING,
  1401. None))
  1402. def _allocate_network_async(self, context, instance, requested_networks,
  1403. macs, security_groups, is_vpn, dhcp_options):
  1404. """Method used to allocate networks in the background.
  1405. Broken out for testing.
  1406. """
  1407. LOG.debug("Allocating IP information in the background.",
  1408. instance=instance)
  1409. retries = CONF.network_allocate_retries
  1410. if retries < 0:
  1411. LOG.warn(_("Treating negative config value (%(retries)s) for "
  1412. "'network_allocate_retries' as 0."),
  1413. {'retries': retries})
  1414. retries = 0
  1415. attempts = retries + 1
  1416. retry_time = 1
  1417. for attempt in range(1, attempts + 1):
  1418. try:
  1419. nwinfo = self.network_api.allocate_for_instance(
  1420. context, instance, vpn=is_vpn,
  1421. requested_networks=requested_networks,
  1422. macs=macs,
  1423. security_groups=security_groups,
  1424. dhcp_options=dhcp_options)
  1425. LOG.debug('Instance network_info: |%s|', nwinfo,
  1426. instance=instance)
  1427. sys_meta = instance.system_metadata
  1428. sys_meta['network_allocated'] = 'True'
  1429. self._instance_update(context, instance.uuid,
  1430. system_metadata=sys_meta)
  1431. return nwinfo
  1432. except Exception:
  1433. exc_info = sys.exc_info()
  1434. log_info = {'attempt': attempt,
  1435. 'attempts': attempts}
  1436. if attempt == attempts:
  1437. LOG.exception(_LE('Instance failed network setup '
  1438. 'after %(attempts)d attempt(s)'),
  1439. log_info)
1440. six.reraise(exc_info[0], exc_info[1], exc_info[2])
  1441. LOG.warn(_('Instance failed network setup '
  1442. '(attempt %(attempt)d of %(attempts)d)'),
  1443. log_info, instance=instance)
  1444. time.sleep(retry_time)
  1445. retry_time *= 2
  1446. if retry_time > 30:
  1447. retry_time = 30
  1448. # Not reached.
  1449. def _build_networks_for_instance(self, context, instance,
  1450. requested_networks, security_groups):
  1451. # If we're here from a reschedule the network may already be allocated.
  1452. if strutils.bool_from_string(
  1453. instance.system_metadata.get('network_allocated', 'False')):
  1454. return self._get_instance_nw_info(context, instance)
  1455. if not self.is_neutron_security_groups:
  1456. security_groups = []
  1457. macs = self.driver.macs_for_instance(instance)
  1458. dhcp_options = self.driver.dhcp_options_for_instance(instance)
  1459. network_info = self._allocate_network(context, instance,
  1460. requested_networks, macs, security_groups, dhcp_options)
  1461. if not instance.access_ip_v4 and not instance.access_ip_v6:
  1462. # If CONF.default_access_ip_network_name is set, grab the
  1463. # corresponding network and set the access ip values accordingly.
  1464. # Note that when there are multiple ips to choose from, an
  1465. # arbitrary one will be chosen.
  1466. network_name = CONF.default_access_ip_network_name
  1467. if not network_name:
  1468. return network_info
  1469. for vif in network_info:
  1470. if vif['network']['label'] == network_name:
  1471. for ip in vif.fixed_ips():
  1472. if ip['version'] == 4:
  1473. instance.access_ip_v4 = ip['address']
  1474. if ip['version'] == 6:
  1475. instance.access_ip_v6 = ip['address']
  1476. instance.save()
  1477. break
  1478. return network_info
  1479. def _allocate_network(self, context, instance, requested_networks, macs,
  1480. security_groups, dhcp_options):
  1481. """Start network allocation asynchronously. Return an instance
  1482. of NetworkInfoAsyncWrapper that can be used to retrieve the
  1483. allocated networks when the operation has finished.
  1484. """
  1485. # NOTE(comstud): Since we're allocating networks asynchronously,
  1486. # this task state has little meaning, as we won't be in this
  1487. # state for very long.
  1488. instance.vm_state = vm_states.BUILDING
  1489. instance.task_state = task_states.NETWORKING
  1490. instance.save(expected_task_state=[None])
  1491. self._update_resource_tracker(context, instance)
  1492. is_vpn = pipelib.is_vpn_image(instance.image_ref)
  1493. return network_model.NetworkInfoAsyncWrapper(
  1494. self._allocate_network_async, context, instance,
  1495. requested_networks, macs, security_groups, is_vpn,
  1496. dhcp_options)
  1497. def _default_root_device_name(self, instance, image_meta, root_bdm):
  1498. try:
  1499. return self.driver.default_root_device_name(instance,
  1500. image_meta,
  1501. root_bdm)
  1502. except NotImplementedError:
  1503. return compute_utils.get_next_device_name(instance, [])
  1504. def _default_device_names_for_instance(self, instance,
  1505. root_device_name,
  1506. *block_device_lists):
  1507. try:
  1508. self.driver.default_device_names_for_instance(instance,
  1509. root_device_name,
  1510. *block_device_lists)
  1511. except NotImplementedError:
  1512. compute_utils.default_device_names_for_instance(
  1513. instance, root_device_name, *block_device_lists)
  1514. def _default_block_device_names(self, context, instance,
  1515. image_meta, block_devices):
  1516. """Verify that all the devices have the device_name set. If not,
  1517. provide a default name.
1518. It also ensures that there is a root_device_name and that it is set
1519. to the first block device in the boot sequence (boot_index=0).
  1520. """
  1521. root_bdm = block_device.get_root_bdm(block_devices)
  1522. if not root_bdm:
  1523. return
  1524. # Get the root_device_name from the root BDM or the instance
  1525. root_device_name = None
  1526. update_root_bdm = False
  1527. if root_bdm.device_name:
  1528. root_device_name = root_bdm.device_name
  1529. instance.root_device_name = root_device_name
  1530. elif instance.root_device_name:
  1531. root_device_name = instance.root_device_name
  1532. root_bdm.device_name = root_device_name
  1533. update_root_bdm = True
  1534. else:
  1535. root_device_name = self._default_root_device_name(instance,
  1536. image_meta,
  1537. root_bdm)
  1538. instance.root_device_name = root_device_name
  1539. root_bdm.device_name = root_device_name
  1540. update_root_bdm = True
  1541. if update_root_bdm:
  1542. root_bdm.save()
  1543. ephemerals = filter(block_device.new_format_is_ephemeral,
  1544. block_devices)
  1545. swap = filter(block_device.new_format_is_swap,
  1546. block_devices)
  1547. block_device_mapping = filter(
  1548. driver_block_device.is_block_device_mapping, block_devices)
  1549. self._default_device_names_for_instance(instance,
  1550. root_device_name,
  1551. ephemerals,
  1552. swap,
  1553. block_device_mapping)
  1554. def _prep_block_device(self, context, instance, bdms,
  1555. do_check_attach=True):
  1556. """Set up the block device for an instance with error logging."""
  1557. try:
  1558. block_device_info = {
  1559. 'root_device_name': instance['root_device_name'],
  1560. 'swap': driver_block_device.convert_swap(bdms),
  1561. 'ephemerals': driver_block_device.convert_ephemerals(bdms),
  1562. 'block_device_mapping': (
  1563. driver_block_device.attach_block_devices(
  1564. driver_block_device.convert_volumes(bdms),
  1565. context, instance, self.volume_api,
  1566. self.driver, do_check_attach=do_check_attach) +
  1567. driver_block_device.attach_block_devices(
  1568. driver_block_device.convert_snapshots(bdms),
  1569. context, instance, self.volume_api,
  1570. self.driver, self._await_block_device_map_created,
  1571. do_check_attach=do_check_attach) +
  1572. driver_block_device.attach_block_devices(
  1573. driver_block_device.convert_images(bdms),
  1574. context, instance, self.volume_api,
  1575. self.driver, self._await_block_device_map_created,
  1576. do_check_attach=do_check_attach) +
  1577. driver_block_device.attach_block_devices(
  1578. driver_block_device.convert_blanks(bdms),
  1579. context, instance, self.volume_api,
  1580. self.driver, self._await_block_device_map_created,
  1581. do_check_attach=do_check_attach))
  1582. }
  1583. if self.use_legacy_block_device_info:
  1584. for bdm_type in ('swap', 'ephemerals', 'block_device_mapping'):
  1585. block_device_info[bdm_type] = \
  1586. driver_block_device.legacy_block_devices(
  1587. block_device_info[bdm_type])
  1588. # Get swap out of the list
  1589. block_device_info['swap'] = driver_block_device.get_swap(
  1590. block_device_info['swap'])
  1591. return block_device_info
  1592. except exception.OverQuota:
  1593. msg = _LW('Failed to create block device for instance due to '
  1594. 'being over volume resource quota')
  1595. LOG.warn(msg, instance=instance)
  1596. raise exception.InvalidBDM()
  1597. except Exception:
  1598. LOG.exception(_LE('Instance failed block device setup'),
  1599. instance=instance)
  1600. raise exception.InvalidBDM()
  1601. @object_compat
  1602. def _spawn(self, context, instance, image_meta, network_info,
  1603. block_device_info, injected_files, admin_password,
  1604. set_access_ip=False):
  1605. """Spawn an instance with error logging and update its power state."""
  1606. instance.vm_state = vm_states.BUILDING
  1607. instance.task_state = task_states.SPAWNING
  1608. instance.save(expected_task_state=task_states.BLOCK_DEVICE_MAPPING)
  1609. try:
  1610. self.driver.spawn(context, instance, image_meta,
  1611. injected_files, admin_password,
  1612. network_info,
  1613. block_device_info)
  1614. except Exception:
  1615. with excutils.save_and_reraise_exception():
  1616. LOG.exception(_LE('Instance failed to spawn'),
  1617. instance=instance)
  1618. current_power_state = self._get_power_state(context, instance)
  1619. instance.power_state = current_power_state
  1620. instance.vm_state = vm_states.ACTIVE
  1621. instance.task_state = None
  1622. instance.launched_at = timeutils.utcnow()
  1623. def _set_access_ip_values():
  1624. """Add access ip values for a given instance.
  1625. If CONF.default_access_ip_network_name is set, this method will
  1626. grab the corresponding network and set the access ip values
  1627. accordingly. Note that when there are multiple ips to choose
  1628. from, an arbitrary one will be chosen.
  1629. """
  1630. network_name = CONF.default_access_ip_network_name
  1631. if not network_name:
  1632. return
  1633. for vif in network_info:
  1634. if vif['network']['label'] == network_name:
  1635. for ip in vif.fixed_ips():
  1636. if ip['version'] == 4:
  1637. instance.access_ip_v4 = ip['address']
  1638. if ip['version'] == 6:
  1639. instance.access_ip_v6 = ip['address']
  1640. return
  1641. if set_access_ip:
  1642. _set_access_ip_values()
  1643. network_info.wait(do_raise=True)
  1644. instance.info_cache.network_info = network_info
  1645. instance.save(expected_task_state=task_states.SPAWNING)
  1646. return instance
  1647. def _notify_about_instance_usage(self, context, instance, event_suffix,
  1648. network_info=None, system_metadata=None,
  1649. extra_usage_info=None, fault=None):
  1650. compute_utils.notify_about_instance_usage(
  1651. self.notifier, context, instance, event_suffix,
  1652. network_info=network_info,
  1653. system_metadata=system_metadata,
  1654. extra_usage_info=extra_usage_info, fault=fault)
  1655. def _deallocate_network(self, context, instance,
  1656. requested_networks=None):
  1657. LOG.debug('Deallocating network for instance', instance=instance)
  1658. self.network_api.deallocate_for_instance(
  1659. context, instance, requested_networks=requested_networks)
  1660. def _get_instance_block_device_info(self, context, instance,
  1661. refresh_conn_info=False,
  1662. bdms=None):
  1663. """Transform block devices to the driver block_device format."""
  1664. if not bdms:
  1665. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  1666. context, instance['uuid'])
  1667. swap = driver_block_device.convert_swap(bdms)
  1668. ephemerals = driver_block_device.convert_ephemerals(bdms)
  1669. block_device_mapping = (
  1670. driver_block_device.convert_volumes(bdms) +
  1671. driver_block_device.convert_snapshots(bdms) +
  1672. driver_block_device.convert_images(bdms))
  1673. if not refresh_conn_info:
1674. # if the block_device_mapping has no value in connection_info
1675. # (returned as None), don't include it in the mapping
  1676. block_device_mapping = [
  1677. bdm for bdm in block_device_mapping
  1678. if bdm.get('connection_info')]
  1679. else:
  1680. block_device_mapping = driver_block_device.refresh_conn_infos(
  1681. block_device_mapping, context, instance, self.volume_api,
  1682. self.driver)
  1683. if self.use_legacy_block_device_info:
  1684. swap = driver_block_device.legacy_block_devices(swap)
  1685. ephemerals = driver_block_device.legacy_block_devices(ephemerals)
  1686. block_device_mapping = driver_block_device.legacy_block_devices(
  1687. block_device_mapping)
  1688. # Get swap out of the list
  1689. swap = driver_block_device.get_swap(swap)
  1690. return {'swap': swap,
  1691. 'ephemerals': ephemerals,
  1692. 'block_device_mapping': block_device_mapping}
  1693. # NOTE(mikal): No object_compat wrapper on this method because its
  1694. # callers all pass objects already
  1695. @wrap_exception()
  1696. @reverts_task_state
  1697. @wrap_instance_fault
  1698. def build_and_run_instance(self, context, instance, image, request_spec,
  1699. filter_properties, admin_password=None,
  1700. injected_files=None, requested_networks=None,
  1701. security_groups=None, block_device_mapping=None,
  1702. node=None, limits=None):
  1703. # NOTE(danms): Remove this in v4.0 of the RPC API
  1704. if (requested_networks and
  1705. not isinstance(requested_networks,
  1706. objects.NetworkRequestList)):
  1707. requested_networks = objects.NetworkRequestList(
  1708. objects=[objects.NetworkRequest.from_tuple(t)
  1709. for t in requested_networks])
  1710. @utils.synchronized(instance.uuid)
  1711. def _locked_do_build_and_run_instance(*args, **kwargs):
  1712. self._do_build_and_run_instance(*args, **kwargs)
  1713. # NOTE(danms): We spawn here to return the RPC worker thread back to
  1714. # the pool. Since what follows could take a really long time, we don't
  1715. # want to tie up RPC workers.
  1716. utils.spawn_n(_locked_do_build_and_run_instance,
  1717. context, instance, image, request_spec,
  1718. filter_properties, admin_password, injected_files,
  1719. requested_networks, security_groups,
  1720. block_device_mapping, node, limits)
  1721. @wrap_exception()
  1722. @reverts_task_state
  1723. @wrap_instance_event
  1724. @wrap_instance_fault
  1725. def _do_build_and_run_instance(self, context, instance, image,
  1726. request_spec, filter_properties, admin_password, injected_files,
  1727. requested_networks, security_groups, block_device_mapping,
  1728. node=None, limits=None):
  1729. try:
  1730. LOG.audit(_('Starting instance...'), context=context,
  1731. instance=instance)
  1732. instance.vm_state = vm_states.BUILDING
  1733. instance.task_state = None
  1734. instance.save(expected_task_state=
  1735. (task_states.SCHEDULING, None))
  1736. except exception.InstanceNotFound:
  1737. msg = 'Instance disappeared before build.'
  1738. LOG.debug(msg, instance=instance)
  1739. return
  1740. except exception.UnexpectedTaskStateError as e:
  1741. LOG.debug(e.format_message(), instance=instance)
  1742. return
  1743. # b64 decode the files to inject:
  1744. decoded_files = self._decode_files(injected_files)
  1745. if limits is None:
  1746. limits = {}
  1747. if node is None:
  1748. node = self.driver.get_available_nodes(refresh=True)[0]
  1749. LOG.debug('No node specified, defaulting to %s', node,
  1750. instance=instance)
  1751. try:
  1752. self._build_and_run_instance(context, instance, image,
  1753. decoded_files, admin_password, requested_networks,
  1754. security_groups, block_device_mapping, node, limits,
  1755. filter_properties)
  1756. except exception.RescheduledException as e:
  1757. LOG.debug(e.format_message(), instance=instance)
  1758. retry = filter_properties.get('retry', None)
  1759. if not retry:
  1760. # no retry information, do not reschedule.
  1761. LOG.debug("Retry info not present, will not reschedule",
  1762. instance=instance)
  1763. self._cleanup_allocated_networks(context, instance,
  1764. requested_networks)
  1765. compute_utils.add_instance_fault_from_exc(context,
  1766. instance, e, sys.exc_info())
  1767. self._set_instance_error_state(context, instance)
  1768. return
  1769. retry['exc'] = traceback.format_exception(*sys.exc_info())
  1770. # NOTE(comstud): Deallocate networks if the driver wants
  1771. # us to do so.
  1772. if self.driver.deallocate_networks_on_reschedule(instance):
  1773. self._cleanup_allocated_networks(context, instance,
  1774. requested_networks)
  1775. instance.task_state = task_states.SCHEDULING
  1776. instance.save()
  1777. self.compute_task_api.build_instances(context, [instance],
  1778. image, filter_properties, admin_password,
  1779. injected_files, requested_networks, security_groups,
  1780. block_device_mapping)
  1781. except (exception.InstanceNotFound,
  1782. exception.UnexpectedDeletingTaskStateError):
  1783. msg = 'Instance disappeared during build.'
  1784. LOG.debug(msg, instance=instance)
  1785. self._cleanup_allocated_networks(context, instance,
  1786. requested_networks)
  1787. except exception.BuildAbortException as e:
  1788. LOG.exception(e.format_message(), instance=instance)
  1789. self._cleanup_allocated_networks(context, instance,
  1790. requested_networks)
  1791. self._cleanup_volumes(context, instance.uuid,
  1792. block_device_mapping, raise_exc=False)
  1793. compute_utils.add_instance_fault_from_exc(context, instance,
  1794. e, sys.exc_info())
  1795. self._set_instance_error_state(context, instance)
  1796. except Exception as e:
  1797. # Should not reach here.
  1798. msg = _LE('Unexpected build failure, not rescheduling build.')
  1799. LOG.exception(msg, instance=instance)
  1800. self._cleanup_allocated_networks(context, instance,
  1801. requested_networks)
  1802. self._cleanup_volumes(context, instance.uuid,
  1803. block_device_mapping, raise_exc=False)
  1804. compute_utils.add_instance_fault_from_exc(context, instance,
  1805. e, sys.exc_info())
  1806. self._set_instance_error_state(context, instance)
  1807. def _build_and_run_instance(self, context, instance, image, injected_files,
  1808. admin_password, requested_networks, security_groups,
  1809. block_device_mapping, node, limits, filter_properties):
  1810. image_name = image.get('name')
  1811. self._notify_about_instance_usage(context, instance, 'create.start',
  1812. extra_usage_info={'image_name': image_name})
  1813. try:
  1814. rt = self._get_resource_tracker(node)
  1815. with rt.instance_claim(context, instance, limits):
  1816. # NOTE(russellb) It's important that this validation be done
  1817. # *after* the resource tracker instance claim, as that is where
  1818. # the host is set on the instance.
  1819. self._validate_instance_group_policy(context, instance,
  1820. filter_properties)
  1821. with self._build_resources(context, instance,
  1822. requested_networks, security_groups, image,
  1823. block_device_mapping) as resources:
  1824. instance.vm_state = vm_states.BUILDING
  1825. instance.task_state = task_states.SPAWNING
  1826. instance.save(expected_task_state=
  1827. task_states.BLOCK_DEVICE_MAPPING)
  1828. block_device_info = resources['block_device_info']
  1829. network_info = resources['network_info']
  1830. self.driver.spawn(context, instance, image,
  1831. injected_files, admin_password,
  1832. network_info=network_info,
  1833. block_device_info=block_device_info)
  1834. except (exception.InstanceNotFound,
  1835. exception.UnexpectedDeletingTaskStateError) as e:
  1836. with excutils.save_and_reraise_exception():
  1837. self._notify_about_instance_usage(context, instance,
  1838. 'create.end', fault=e)
  1839. except exception.ComputeResourcesUnavailable as e:
  1840. LOG.debug(e.format_message(), instance=instance)
  1841. self._notify_about_instance_usage(context, instance,
  1842. 'create.error', fault=e)
  1843. raise exception.RescheduledException(
  1844. instance_uuid=instance.uuid, reason=e.format_message())
  1845. except exception.BuildAbortException as e:
  1846. with excutils.save_and_reraise_exception():
  1847. LOG.debug(e.format_message(), instance=instance)
  1848. self._notify_about_instance_usage(context, instance,
  1849. 'create.error', fault=e)
  1850. except (exception.FixedIpLimitExceeded,
  1851. exception.NoMoreNetworks) as e:
  1852. LOG.warn(_LW('No more network or fixed IP to be allocated'),
  1853. instance=instance)
  1854. self._notify_about_instance_usage(context, instance,
  1855. 'create.error', fault=e)
  1856. msg = _('Failed to allocate the network(s) with error %s, '
  1857. 'not rescheduling.') % e.format_message()
  1858. raise exception.BuildAbortException(instance_uuid=instance.uuid,
  1859. reason=msg)
  1860. except (exception.VirtualInterfaceCreateException,
  1861. exception.VirtualInterfaceMacAddressException) as e:
  1862. LOG.exception(_LE('Failed to allocate network(s)'),
  1863. instance=instance)
  1864. self._notify_about_instance_usage(context, instance,
  1865. 'create.error', fault=e)
  1866. msg = _('Failed to allocate the network(s), not rescheduling.')
  1867. raise exception.BuildAbortException(instance_uuid=instance.uuid,
  1868. reason=msg)
  1869. except (exception.FlavorDiskTooSmall,
  1870. exception.FlavorMemoryTooSmall,
  1871. exception.ImageNotActive,
  1872. exception.ImageUnacceptable) as e:
  1873. self._notify_about_instance_usage(context, instance,
  1874. 'create.error', fault=e)
  1875. raise exception.BuildAbortException(instance_uuid=instance.uuid,
  1876. reason=e.format_message())
  1877. except Exception as e:
  1878. self._notify_about_instance_usage(context, instance,
  1879. 'create.error', fault=e)
  1880. raise exception.RescheduledException(
  1881. instance_uuid=instance.uuid, reason=six.text_type(e))
  1882. # NOTE(alaski): This is only useful during reschedules, remove it now.
  1883. instance.system_metadata.pop('network_allocated', None)
  1884. instance.power_state = self._get_power_state(context, instance)
  1885. instance.vm_state = vm_states.ACTIVE
  1886. instance.task_state = None
  1887. instance.launched_at = timeutils.utcnow()
  1888. try:
  1889. instance.save(expected_task_state=task_states.SPAWNING)
  1890. except (exception.InstanceNotFound,
  1891. exception.UnexpectedDeletingTaskStateError) as e:
  1892. with excutils.save_and_reraise_exception():
  1893. self._notify_about_instance_usage(context, instance,
  1894. 'create.end', fault=e)
  1895. self._notify_about_instance_usage(context, instance, 'create.end',
  1896. extra_usage_info={'message': _('Success')},
  1897. network_info=network_info)
  1898. @contextlib.contextmanager
  1899. def _build_resources(self, context, instance, requested_networks,
  1900. security_groups, image, block_device_mapping):
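# Usage sketch (this is how _build_and_run_instance above consumes the
# context manager):
#
#     with self._build_resources(context, instance, requested_networks,
#                                security_groups, image,
#                                block_device_mapping) as resources:
#         network_info = resources['network_info']
#         block_device_info = resources['block_device_info']
#         self.driver.spawn(...)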
  1901. resources = {}
  1902. network_info = None
  1903. try:
  1904. network_info = self._build_networks_for_instance(context, instance,
  1905. requested_networks, security_groups)
  1906. resources['network_info'] = network_info
  1907. except (exception.InstanceNotFound,
  1908. exception.UnexpectedDeletingTaskStateError):
  1909. raise
  1910. except exception.UnexpectedTaskStateError as e:
  1911. raise exception.BuildAbortException(instance_uuid=instance.uuid,
  1912. reason=e.format_message())
  1913. except Exception:
1914. # Because this allocation is async, any failures are likely to occur
  1915. # when the driver accesses network_info during spawn().
  1916. LOG.exception(_LE('Failed to allocate network(s)'),
  1917. instance=instance)
  1918. msg = _('Failed to allocate the network(s), not rescheduling.')
  1919. raise exception.BuildAbortException(instance_uuid=instance.uuid,
  1920. reason=msg)
  1921. try:
  1922. # Verify that all the BDMs have a device_name set and assign a
  1923. # default to the ones missing it with the help of the driver.
  1924. self._default_block_device_names(context, instance, image,
  1925. block_device_mapping)
  1926. instance.vm_state = vm_states.BUILDING
  1927. instance.task_state = task_states.BLOCK_DEVICE_MAPPING
  1928. instance.save()
  1929. block_device_info = self._prep_block_device(context, instance,
  1930. block_device_mapping)
  1931. resources['block_device_info'] = block_device_info
  1932. except (exception.InstanceNotFound,
  1933. exception.UnexpectedDeletingTaskStateError):
1934. with excutils.save_and_reraise_exception():
  1935. # Make sure the async call finishes
  1936. if network_info is not None:
  1937. network_info.wait(do_raise=False)
  1938. except exception.UnexpectedTaskStateError as e:
  1939. # Make sure the async call finishes
  1940. if network_info is not None:
  1941. network_info.wait(do_raise=False)
  1942. raise exception.BuildAbortException(instance_uuid=instance.uuid,
  1943. reason=e.format_message())
  1944. except Exception:
  1945. LOG.exception(_LE('Failure prepping block device'),
  1946. instance=instance)
  1947. # Make sure the async call finishes
  1948. if network_info is not None:
  1949. network_info.wait(do_raise=False)
  1950. msg = _('Failure prepping block device.')
  1951. raise exception.BuildAbortException(instance_uuid=instance.uuid,
  1952. reason=msg)
  1953. try:
  1954. yield resources
  1955. except Exception as exc:
  1956. with excutils.save_and_reraise_exception() as ctxt:
  1957. if not isinstance(exc, (exception.InstanceNotFound,
  1958. exception.UnexpectedDeletingTaskStateError)):
  1959. LOG.exception(_LE('Instance failed to spawn'),
  1960. instance=instance)
  1961. # Make sure the async call finishes
  1962. if network_info is not None:
  1963. network_info.wait(do_raise=False)
  1964. try:
  1965. self._shutdown_instance(context, instance,
  1966. block_device_mapping, requested_networks,
  1967. try_deallocate_networks=False)
  1968. except Exception:
  1969. ctxt.reraise = False
  1970. msg = _('Could not clean up failed build,'
  1971. ' not rescheduling')
  1972. raise exception.BuildAbortException(
  1973. instance_uuid=instance.uuid, reason=msg)
  1974. def _cleanup_allocated_networks(self, context, instance,
  1975. requested_networks):
  1976. try:
  1977. self._deallocate_network(context, instance, requested_networks)
  1978. except Exception:
  1979. msg = _LE('Failed to deallocate networks')
  1980. LOG.exception(msg, instance=instance)
  1981. return
  1982. instance.system_metadata['network_allocated'] = 'False'
  1983. try:
  1984. instance.save()
  1985. except exception.InstanceNotFound:
  1986. # NOTE(alaski): It's possible that we're cleaning up the networks
  1987. # because the instance was deleted. If that's the case then this
  1988. # exception will be raised by instance.save()
  1989. pass
  1990. @object_compat
  1991. @messaging.expected_exceptions(exception.BuildAbortException,
  1992. exception.UnexpectedTaskStateError,
  1993. exception.VirtualInterfaceCreateException,
  1994. exception.RescheduledException)
  1995. @wrap_exception()
  1996. @reverts_task_state
  1997. @wrap_instance_event
  1998. @wrap_instance_fault
  1999. def run_instance(self, context, instance, request_spec,
  2000. filter_properties, requested_networks,
  2001. injected_files, admin_password,
  2002. is_first_time, node, legacy_bdm_in_spec):
  2003. # NOTE(alaski) This method should be deprecated when the scheduler and
  2004. # compute rpc interfaces are bumped to 4.x, and slated for removal in
  2005. # 5.x as it is no longer used.
  2006. if filter_properties is None:
  2007. filter_properties = {}
  2008. @utils.synchronized(instance.uuid)
  2009. def do_run_instance():
  2010. self._run_instance(context, request_spec,
  2011. filter_properties, requested_networks, injected_files,
  2012. admin_password, is_first_time, node, instance,
  2013. legacy_bdm_in_spec)
  2014. do_run_instance()
  2015. def _try_deallocate_network(self, context, instance,
  2016. requested_networks=None):
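# Tear down the network allocated to the instance; if deallocation
# itself fails, the instance is put into ERROR and the exception is
# re-raised.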
  2017. try:
  2018. # tear down allocated network structure
  2019. self._deallocate_network(context, instance, requested_networks)
  2020. except Exception:
  2021. with excutils.save_and_reraise_exception():
  2022. LOG.error(_LE('Failed to deallocate network for instance.'),
  2023. instance=instance)
  2024. self._set_instance_error_state(context, instance)
  2025. def _get_power_off_values(self, context, instance, clean_shutdown):
  2026. """Get the timing configuration for powering down this instance."""
  2027. if clean_shutdown:
  2028. timeout = compute_utils.get_value_from_system_metadata(instance,
  2029. key='image_os_shutdown_timeout', type=int,
  2030. default=CONF.shutdown_timeout)
  2031. retry_interval = self.SHUTDOWN_RETRY_INTERVAL
  2032. else:
  2033. timeout = 0
  2034. retry_interval = 0
  2035. return timeout, retry_interval
  2036. def _power_off_instance(self, context, instance, clean_shutdown=True):
  2037. """Power off an instance on this host."""
  2038. timeout, retry_interval = self._get_power_off_values(context,
  2039. instance, clean_shutdown)
  2040. self.driver.power_off(instance, timeout, retry_interval)
  2041. def _shutdown_instance(self, context, instance,
  2042. bdms, requested_networks=None, notify=True,
  2043. try_deallocate_networks=True):
  2044. """Shutdown an instance on this host.
2045. :param context: security context
2046. :param instance: a nova.objects.Instance object
2047. :param bdms: the block devices for the instance to be torn
2048. down
2049. :param requested_networks: the networks on which the instance
2050. has ports
2051. :param notify: true if a final usage notification should be
2052. emitted
2053. :param try_deallocate_networks: false if we should avoid
2054. trying to tear down networking
  2055. """
  2056. context = context.elevated()
  2057. LOG.audit(_('%(action_str)s instance') % {'action_str': 'Terminating'},
  2058. context=context, instance=instance)
  2059. if notify:
  2060. self._notify_about_instance_usage(context, instance,
  2061. "shutdown.start")
  2062. network_info = compute_utils.get_nw_info_for_instance(instance)
  2063. # NOTE(vish) get bdms before destroying the instance
  2064. vol_bdms = [bdm for bdm in bdms if bdm.is_volume]
  2065. block_device_info = self._get_instance_block_device_info(
  2066. context, instance, bdms=bdms)
  2067. # NOTE(melwitt): attempt driver destroy before releasing ip, may
  2068. # want to keep ip allocated for certain failures
  2069. try:
  2070. self.driver.destroy(context, instance, network_info,
  2071. block_device_info)
  2072. except exception.InstancePowerOffFailure:
  2073. # if the instance can't power off, don't release the ip
  2074. with excutils.save_and_reraise_exception():
  2075. pass
  2076. except Exception:
  2077. with excutils.save_and_reraise_exception():
  2078. # deallocate ip and fail without proceeding to
  2079. # volume api calls, preserving current behavior
  2080. if try_deallocate_networks:
  2081. self._try_deallocate_network(context, instance,
  2082. requested_networks)
  2083. if try_deallocate_networks:
  2084. self._try_deallocate_network(context, instance, requested_networks)
  2085. for bdm in vol_bdms:
  2086. try:
  2087. # NOTE(vish): actual driver detach done in driver.destroy, so
  2088. # just tell cinder that we are done with it.
  2089. connector = self.driver.get_volume_connector(instance)
  2090. self.volume_api.terminate_connection(context,
  2091. bdm.volume_id,
  2092. connector)
  2093. self.volume_api.detach(context, bdm.volume_id)
  2094. except exception.DiskNotFound as exc:
  2095. LOG.debug('Ignoring DiskNotFound: %s', exc,
  2096. instance=instance)
  2097. except exception.VolumeNotFound as exc:
  2098. LOG.debug('Ignoring VolumeNotFound: %s', exc,
  2099. instance=instance)
  2100. except cinder_exception.EndpointNotFound as exc:
  2101. LOG.warn(_LW('Ignoring EndpointNotFound: %s'), exc,
  2102. instance=instance)
  2103. if notify:
  2104. self._notify_about_instance_usage(context, instance,
  2105. "shutdown.end")
  2106. def _cleanup_volumes(self, context, instance_uuid, bdms, raise_exc=True):
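# Delete any volumes that were attached to the instance and flagged
# delete_on_termination; individual failures are logged and, if
# raise_exc is True, the last one is re-raised once all BDMs have been
# processed.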
  2107. exc_info = None
  2108. for bdm in bdms:
  2109. LOG.debug("terminating bdm %s", bdm,
  2110. instance_uuid=instance_uuid)
  2111. if bdm.volume_id and bdm.delete_on_termination:
  2112. try:
  2113. self.volume_api.delete(context, bdm.volume_id)
  2114. except Exception as exc:
  2115. exc_info = sys.exc_info()
  2116. LOG.warn(_LW('Failed to delete volume: %(volume_id)s due '
  2117. 'to %(exc)s'), {'volume_id': bdm.volume_id,
  2118. 'exc': exc})
  2119. if exc_info is not None and raise_exc:
  2120. six.reraise(exc_info[0], exc_info[1], exc_info[2])
  2121. @hooks.add_hook("delete_instance")
  2122. def _delete_instance(self, context, instance, bdms, quotas):
  2123. """Delete an instance on this host. Commit or rollback quotas
  2124. as necessary.
  2125. """
  2126. was_soft_deleted = instance['vm_state'] == vm_states.SOFT_DELETED
  2127. if was_soft_deleted:
  2128. # Instances in SOFT_DELETED vm_state have already had quotas
  2129. # decremented.
  2130. try:
  2131. quotas.rollback()
  2132. except Exception:
  2133. pass
  2134. try:
  2135. events = self.instance_events.clear_events_for_instance(instance)
  2136. if events:
  2137. LOG.debug('Events pending at deletion: %(events)s',
  2138. {'events': ','.join(events.keys())},
  2139. instance=instance)
  2140. instance.info_cache.delete()
  2141. self._notify_about_instance_usage(context, instance,
  2142. "delete.start")
  2143. self._shutdown_instance(context, instance, bdms)
  2144. # NOTE(vish): We have already deleted the instance, so we have
  2145. # to ignore problems cleaning up the volumes. It
  2146. # would be nice to let the user know somehow that
  2147. # the volume deletion failed, but it is not
  2148. # acceptable to have an instance that can not be
  2149. # deleted. Perhaps this could be reworked in the
  2150. # future to set an instance fault the first time
  2151. # and to only ignore the failure if the instance
  2152. # is already in ERROR.
  2153. self._cleanup_volumes(context, instance.uuid, bdms,
  2154. raise_exc=False)
2155. # if a delete task succeeds, always update vm state and task
  2156. # state without expecting task state to be DELETING
  2157. instance.vm_state = vm_states.DELETED
  2158. instance.task_state = None
  2159. instance.terminated_at = timeutils.utcnow()
  2160. instance.save()
  2161. self._update_resource_tracker(context, instance)
  2162. system_meta = instance.system_metadata
  2163. instance.destroy()
  2164. except Exception:
  2165. with excutils.save_and_reraise_exception():
  2166. quotas.rollback()
  2167. self._complete_deletion(context,
  2168. instance,
  2169. bdms,
  2170. quotas,
  2171. system_meta)
  2172. @wrap_exception()
  2173. @reverts_task_state
  2174. @wrap_instance_event
  2175. @wrap_instance_fault
  2176. def terminate_instance(self, context, instance, bdms, reservations):
  2177. """Terminate an instance on this host."""
  2178. # NOTE (ndipanov): If we get non-object BDMs, just get them from the
  2179. # db again, as this means they are sent in the old format and we want
  2180. # to avoid converting them back when we can just get them.
  2181. # Remove this when we bump the RPC major version to 4.0
  2182. if (bdms and
  2183. any(not isinstance(bdm, obj_base.NovaObject)
  2184. for bdm in bdms)):
  2185. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  2186. context, instance.uuid)
  2187. quotas = objects.Quotas.from_reservations(context,
  2188. reservations,
  2189. instance=instance)
  2190. @utils.synchronized(instance.uuid)
  2191. def do_terminate_instance(instance, bdms):
  2192. try:
  2193. self._delete_instance(context, instance, bdms, quotas)
  2194. except exception.InstanceNotFound:
  2195. LOG.info(_("Instance disappeared during terminate"),
  2196. instance=instance)
  2197. except Exception:
2198. # As we're trying to delete, always go to ERROR if something
  2199. # goes wrong that _delete_instance can't handle.
  2200. with excutils.save_and_reraise_exception():
  2201. LOG.exception(_LE('Setting instance vm_state to ERROR'),
  2202. instance=instance)
  2203. self._set_instance_error_state(context, instance)
  2204. do_terminate_instance(instance, bdms)
  2205. # NOTE(johannes): This is probably better named power_off_instance
  2206. # so it matches the driver method, but because of other issues, we
  2207. # can't use that name in grizzly.
  2208. @wrap_exception()
  2209. @reverts_task_state
  2210. @wrap_instance_event
  2211. @wrap_instance_fault
  2212. def stop_instance(self, context, instance, clean_shutdown=True):
  2213. """Stopping an instance on this host."""
  2214. @utils.synchronized(instance.uuid)
  2215. def do_stop_instance():
  2216. current_power_state = self._get_power_state(context, instance)
  2217. LOG.debug('Stopping instance; current vm_state: %(vm_state)s, '
  2218. 'current task_state: %(task_state)s, current DB '
  2219. 'power_state: %(db_power_state)s, current VM '
  2220. 'power_state: %(current_power_state)s',
  2221. dict(vm_state=instance.vm_state,
  2222. task_state=instance.task_state,
  2223. db_power_state=instance.power_state,
  2224. current_power_state=current_power_state),
  2225. instance_uuid=instance.uuid)
  2226. # NOTE(mriedem): If the instance is already powered off, we are
  2227. # possibly tearing down and racing with other operations, so we can
  2228. # expect the task_state to be None if something else updates the
  2229. # instance and we're not locking it.
  2230. expected_task_state = [task_states.POWERING_OFF]
  2231. # The list of power states is from _sync_instance_power_state.
  2232. if current_power_state in (power_state.NOSTATE,
  2233. power_state.SHUTDOWN,
  2234. power_state.CRASHED):
  2235. LOG.info(_LI('Instance is already powered off in the '
  2236. 'hypervisor when stop is called.'),
  2237. instance=instance)
  2238. expected_task_state.append(None)
  2239. self._notify_about_instance_usage(context, instance,
  2240. "power_off.start")
  2241. self._power_off_instance(context, instance, clean_shutdown)
  2242. current_power_state = self._get_power_state(context, instance)
  2243. instance.power_state = current_power_state
  2244. instance.vm_state = vm_states.STOPPED
  2245. instance.task_state = None
  2246. instance.save(expected_task_state=expected_task_state)
  2247. self._notify_about_instance_usage(context, instance,
  2248. "power_off.end")
  2249. do_stop_instance()
  2250. def _power_on(self, context, instance):
  2251. network_info = self._get_instance_nw_info(context, instance)
  2252. block_device_info = self._get_instance_block_device_info(context,
  2253. instance)
  2254. self.driver.power_on(context, instance,
  2255. network_info,
  2256. block_device_info)
  2257. # NOTE(johannes): This is probably better named power_on_instance
  2258. # so it matches the driver method, but because of other issues, we
  2259. # can't use that name in grizzly.
  2260. @wrap_exception()
  2261. @reverts_task_state
  2262. @wrap_instance_event
  2263. @wrap_instance_fault
  2264. def start_instance(self, context, instance):
  2265. """Starting an instance on this host."""
  2266. self._notify_about_instance_usage(context, instance, "power_on.start")
  2267. self._power_on(context, instance)
  2268. current_power_state = self._get_power_state(context, instance)
  2269. instance.power_state = current_power_state
  2270. instance.vm_state = vm_states.ACTIVE
  2271. instance.task_state = None
  2272. instance.save(expected_task_state=task_states.POWERING_ON)
  2273. self._notify_about_instance_usage(context, instance, "power_on.end")
  2274. @wrap_exception()
  2275. @reverts_task_state
  2276. @wrap_instance_event
  2277. @wrap_instance_fault
  2278. def soft_delete_instance(self, context, instance, reservations):
  2279. """Soft delete an instance on this host."""
  2280. quotas = objects.Quotas.from_reservations(context,
  2281. reservations,
  2282. instance=instance)
  2283. try:
  2284. self._notify_about_instance_usage(context, instance,
  2285. "soft_delete.start")
  2286. try:
  2287. self.driver.soft_delete(instance)
  2288. except NotImplementedError:
  2289. # Fallback to just powering off the instance if the
  2290. # hypervisor doesn't implement the soft_delete method
  2291. self.driver.power_off(instance)
  2292. current_power_state = self._get_power_state(context, instance)
  2293. instance.power_state = current_power_state
  2294. instance.vm_state = vm_states.SOFT_DELETED
  2295. instance.task_state = None
  2296. instance.save(expected_task_state=[task_states.SOFT_DELETING])
  2297. except Exception:
  2298. with excutils.save_and_reraise_exception():
  2299. quotas.rollback()
  2300. quotas.commit()
  2301. self._notify_about_instance_usage(context, instance, "soft_delete.end")
  2302. @object_compat
  2303. @wrap_exception()
  2304. @reverts_task_state
  2305. @wrap_instance_event
  2306. @wrap_instance_fault
  2307. def restore_instance(self, context, instance):
  2308. """Restore a soft-deleted instance on this host."""
  2309. self._notify_about_instance_usage(context, instance, "restore.start")
  2310. try:
  2311. self.driver.restore(instance)
  2312. except NotImplementedError:
  2313. # Fallback to just powering on the instance if the hypervisor
  2314. # doesn't implement the restore method
  2315. self._power_on(context, instance)
  2316. current_power_state = self._get_power_state(context, instance)
  2317. instance.power_state = current_power_state
  2318. instance.vm_state = vm_states.ACTIVE
  2319. instance.task_state = None
  2320. instance.save(expected_task_state=task_states.RESTORING)
  2321. self._notify_about_instance_usage(context, instance, "restore.end")
  2322. def _rebuild_default_impl(self, context, instance, image_meta,
  2323. injected_files, admin_password, bdms,
  2324. detach_block_devices, attach_block_devices,
  2325. network_info=None,
  2326. recreate=False, block_device_info=None,
  2327. preserve_ephemeral=False):
  2328. if preserve_ephemeral:
  2329. # The default code path does not support preserving ephemeral
  2330. # partitions.
  2331. raise exception.PreserveEphemeralNotSupported()
  2332. detach_block_devices(context, bdms)
  2333. if not recreate:
  2334. self.driver.destroy(context, instance, network_info,
  2335. block_device_info=block_device_info)
  2336. instance.task_state = task_states.REBUILD_BLOCK_DEVICE_MAPPING
  2337. instance.save(expected_task_state=[task_states.REBUILDING])
  2338. new_block_device_info = attach_block_devices(context, instance, bdms)
  2339. instance.task_state = task_states.REBUILD_SPAWNING
  2340. instance.save(
  2341. expected_task_state=[task_states.REBUILD_BLOCK_DEVICE_MAPPING])
  2342. self.driver.spawn(context, instance, image_meta, injected_files,
  2343. admin_password, network_info=network_info,
  2344. block_device_info=new_block_device_info)
  2345. @object_compat
  2346. @messaging.expected_exceptions(exception.PreserveEphemeralNotSupported)
  2347. @wrap_exception()
  2348. @reverts_task_state
  2349. @wrap_instance_event
  2350. @wrap_instance_fault
  2351. def rebuild_instance(self, context, instance, orig_image_ref, image_ref,
  2352. injected_files, new_pass, orig_sys_metadata,
  2353. bdms, recreate, on_shared_storage,
  2354. preserve_ephemeral=False):
  2355. """Destroy and re-make this instance.
  2356. A 'rebuild' effectively purges all existing data from the system and
  2357. remakes the VM with given 'metadata' and 'personalities'.
  2358. :param context: `nova.RequestContext` object
  2359. :param instance: Instance object
  2360. :param orig_image_ref: Original image_ref before rebuild
  2361. :param image_ref: New image_ref for rebuild
  2362. :param injected_files: Files to inject
  2363. :param new_pass: password to set on rebuilt instance
  2364. :param orig_sys_metadata: instance system metadata from pre-rebuild
  2365. :param bdms: block-device-mappings to use for rebuild
  2366. :param recreate: True if the instance is being recreated (e.g. the
  2367. hypervisor it was on failed) - cleanup of old state will be
  2368. skipped.
  2369. :param on_shared_storage: True if instance files on shared storage
  2370. :param preserve_ephemeral: True if the default ephemeral storage
  2371. partition must be preserved on rebuild
  2372. """
  2373. context = context.elevated()
  2374. # NOTE (ndipanov): If we get non-object BDMs, just get them from the
  2375. # db again, as this means they are sent in the old format and we want
  2376. # to avoid converting them back when we can just get them.
  2377. # Remove this on the next major RPC version bump
  2378. if (bdms and
  2379. any(not isinstance(bdm, obj_base.NovaObject)
  2380. for bdm in bdms)):
  2381. bdms = None
  2382. orig_vm_state = instance.vm_state
  2383. with self._error_out_instance_on_exception(context, instance):
  2384. LOG.audit(_("Rebuilding instance"), context=context,
  2385. instance=instance)
  2386. if recreate:
  2387. if not self.driver.capabilities["supports_recreate"]:
  2388. raise exception.InstanceRecreateNotSupported
  2389. self._check_instance_exists(context, instance)
2390. # To cover the case when the admin expects the instance files to be
2391. # on shared storage but they are not accessible, and vice versa
  2392. if on_shared_storage != self.driver.instance_on_disk(instance):
  2393. raise exception.InvalidSharedStorage(
  2394. _("Invalid state of instance files on shared"
  2395. " storage"))
  2396. if on_shared_storage:
  2397. LOG.info(_('disk on shared storage, recreating using'
  2398. ' existing disk'))
  2399. else:
  2400. image_ref = orig_image_ref = instance.image_ref
  2401. LOG.info(_("disk not on shared storage, rebuilding from:"
  2402. " '%s'") % str(image_ref))
  2403. # NOTE(mriedem): On a recreate (evacuate), we need to update
2404. # the instance's host and node properties to reflect its
  2405. # destination node for the recreate.
  2406. node_name = None
  2407. try:
  2408. compute_node = self._get_compute_info(context, self.host)
  2409. node_name = compute_node.hypervisor_hostname
  2410. except exception.NotFound:
  2411. LOG.exception(_LE('Failed to get compute_info for %s'),
  2412. self.host)
  2413. finally:
  2414. instance.host = self.host
  2415. instance.node = node_name
  2416. instance.save()
  2417. if image_ref:
  2418. image_meta = self.image_api.get(context, image_ref)
  2419. else:
  2420. image_meta = {}
  2421. # This instance.exists message should contain the original
  2422. # image_ref, not the new one. Since the DB has been updated
  2423. # to point to the new one... we have to override it.
  2424. # TODO(jaypipes): Move generate_image_url() into the nova.image.api
  2425. orig_image_ref_url = glance.generate_image_url(orig_image_ref)
  2426. extra_usage_info = {'image_ref_url': orig_image_ref_url}
  2427. self.conductor_api.notify_usage_exists(context,
  2428. obj_base.obj_to_primitive(instance),
  2429. current_period=True, system_metadata=orig_sys_metadata,
  2430. extra_usage_info=extra_usage_info)
  2431. # This message should contain the new image_ref
  2432. extra_usage_info = {'image_name': image_meta.get('name', '')}
  2433. self._notify_about_instance_usage(context, instance,
  2434. "rebuild.start", extra_usage_info=extra_usage_info)
  2435. instance.power_state = self._get_power_state(context, instance)
  2436. instance.task_state = task_states.REBUILDING
  2437. instance.save(expected_task_state=[task_states.REBUILDING])
  2438. if recreate:
  2439. self.network_api.setup_networks_on_host(
  2440. context, instance, self.host)
  2441. network_info = compute_utils.get_nw_info_for_instance(instance)
  2442. if bdms is None:
  2443. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  2444. context, instance.uuid)
  2445. block_device_info = \
  2446. self._get_instance_block_device_info(
  2447. context, instance, bdms=bdms)
  2448. def detach_block_devices(context, bdms):
  2449. for bdm in bdms:
  2450. if bdm.is_volume:
  2451. self.volume_api.detach(context, bdm.volume_id)
  2452. files = self._decode_files(injected_files)
  2453. kwargs = dict(
  2454. context=context,
  2455. instance=instance,
  2456. image_meta=image_meta,
  2457. injected_files=files,
  2458. admin_password=new_pass,
  2459. bdms=bdms,
  2460. detach_block_devices=detach_block_devices,
  2461. attach_block_devices=self._prep_block_device,
  2462. block_device_info=block_device_info,
  2463. network_info=network_info,
  2464. preserve_ephemeral=preserve_ephemeral,
  2465. recreate=recreate)
  2466. try:
  2467. self.driver.rebuild(**kwargs)
  2468. except NotImplementedError:
  2469. # NOTE(rpodolyaka): driver doesn't provide specialized version
  2470. # of rebuild, fall back to the default implementation
  2471. self._rebuild_default_impl(**kwargs)
  2472. instance.power_state = self._get_power_state(context, instance)
  2473. instance.vm_state = vm_states.ACTIVE
  2474. instance.task_state = None
  2475. instance.launched_at = timeutils.utcnow()
  2476. instance.save(expected_task_state=[task_states.REBUILD_SPAWNING])
  2477. if orig_vm_state == vm_states.STOPPED:
  2478. LOG.info(_LI("bringing vm to original state: '%s'"),
  2479. orig_vm_state, instance=instance)
  2480. instance.vm_state = vm_states.ACTIVE
  2481. instance.task_state = task_states.POWERING_OFF
  2482. instance.progress = 0
  2483. instance.save()
  2484. self.stop_instance(context, instance)
  2485. self._notify_about_instance_usage(
  2486. context, instance, "rebuild.end",
  2487. network_info=network_info,
  2488. extra_usage_info=extra_usage_info)
  2489. def _handle_bad_volumes_detached(self, context, instance, bad_devices,
  2490. block_device_info):
  2491. """Handle cases where the virt-layer had to detach non-working volumes
  2492. in order to complete an operation.
  2493. """
  2494. for bdm in block_device_info['block_device_mapping']:
  2495. if bdm.get('mount_device') in bad_devices:
  2496. try:
  2497. volume_id = bdm['connection_info']['data']['volume_id']
  2498. except KeyError:
  2499. continue
  2500. # NOTE(sirp): ideally we'd just call
  2501. # `compute_api.detach_volume` here but since that hits the
  2502. # DB directly, that's off limits from within the
  2503. # compute-manager.
  2504. #
  2505. # API-detach
2506. LOG.info(_("Detaching volume via the volume API: %s") % volume_id)
  2507. volume = self.volume_api.get(context, volume_id)
  2508. self.volume_api.check_detach(context, volume)
  2509. self.volume_api.begin_detaching(context, volume_id)
  2510. # Manager-detach
  2511. self.detach_volume(context, volume_id, instance)
  2512. @wrap_exception()
  2513. @reverts_task_state
  2514. @wrap_instance_event
  2515. @wrap_instance_fault
  2516. def reboot_instance(self, context, instance, block_device_info,
  2517. reboot_type):
  2518. """Reboot an instance on this host."""
  2519. # acknowledge the request made it to the manager
  2520. if reboot_type == "SOFT":
  2521. instance.task_state = task_states.REBOOT_PENDING
  2522. expected_states = (task_states.REBOOTING,
  2523. task_states.REBOOT_PENDING,
  2524. task_states.REBOOT_STARTED)
  2525. else:
  2526. instance.task_state = task_states.REBOOT_PENDING_HARD
  2527. expected_states = (task_states.REBOOTING_HARD,
  2528. task_states.REBOOT_PENDING_HARD,
  2529. task_states.REBOOT_STARTED_HARD)
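# Several task states are accepted here because a reboot may be
# re-processed (e.g. after a compute service restart) when the
# instance has already progressed past REBOOTING into the
# PENDING/STARTED states.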
  2530. context = context.elevated()
  2531. LOG.audit(_("Rebooting instance"), context=context, instance=instance)
  2532. block_device_info = self._get_instance_block_device_info(context,
  2533. instance)
  2534. network_info = self._get_instance_nw_info(context, instance)
  2535. self._notify_about_instance_usage(context, instance, "reboot.start")
  2536. current_power_state = self._get_power_state(context, instance)
  2537. instance.power_state = current_power_state
  2538. instance.save(expected_task_state=expected_states)
  2539. if instance['power_state'] != power_state.RUNNING:
  2540. state = instance['power_state']
  2541. running = power_state.RUNNING
  2542. LOG.warn(_('trying to reboot a non-running instance:'
  2543. ' (state: %(state)s expected: %(running)s)'),
  2544. {'state': state, 'running': running},
  2545. context=context, instance=instance)
  2546. def bad_volumes_callback(bad_devices):
  2547. self._handle_bad_volumes_detached(
  2548. context, instance, bad_devices, block_device_info)
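# The driver calls this back with any block devices it had to detach
# because they were not working, so the corresponding volumes are also
# detached at the volume-API level (see _handle_bad_volumes_detached).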
  2549. try:
  2550. # Don't change it out of rescue mode
  2551. if instance['vm_state'] == vm_states.RESCUED:
  2552. new_vm_state = vm_states.RESCUED
  2553. else:
  2554. new_vm_state = vm_states.ACTIVE
  2555. new_power_state = None
  2556. if reboot_type == "SOFT":
  2557. instance.task_state = task_states.REBOOT_STARTED
  2558. expected_state = task_states.REBOOT_PENDING
  2559. else:
  2560. instance.task_state = task_states.REBOOT_STARTED_HARD
  2561. expected_state = task_states.REBOOT_PENDING_HARD
  2562. instance.save(expected_task_state=expected_state)
  2563. self.driver.reboot(context, instance,
  2564. network_info,
  2565. reboot_type,
  2566. block_device_info=block_device_info,
  2567. bad_volumes_callback=bad_volumes_callback)
  2568. except Exception as error:
  2569. with excutils.save_and_reraise_exception() as ctxt:
  2570. exc_info = sys.exc_info()
2571. # If the reboot failed but the VM is still running, don't
2572. # put it into an error state.
  2573. new_power_state = self._get_power_state(context, instance)
  2574. if new_power_state == power_state.RUNNING:
  2575. LOG.warning(_('Reboot failed but instance is running'),
  2576. context=context, instance=instance)
  2577. compute_utils.add_instance_fault_from_exc(context,
  2578. instance, error, exc_info)
  2579. self._notify_about_instance_usage(context, instance,
  2580. 'reboot.error', fault=error)
  2581. ctxt.reraise = False
  2582. else:
  2583. LOG.error(_LE('Cannot reboot instance: %s'), error,
  2584. context=context, instance=instance)
  2585. self._set_instance_obj_error_state(context, instance)
  2586. if not new_power_state:
  2587. new_power_state = self._get_power_state(context, instance)
  2588. try:
  2589. instance.power_state = new_power_state
  2590. instance.vm_state = new_vm_state
  2591. instance.task_state = None
  2592. instance.save()
  2593. except exception.InstanceNotFound:
  2594. LOG.warn(_("Instance disappeared during reboot"),
  2595. context=context, instance=instance)
  2596. self._notify_about_instance_usage(context, instance, "reboot.end")
  2597. @delete_image_on_error
  2598. def _do_snapshot_instance(self, context, image_id, instance, rotation):
  2599. if rotation < 0:
  2600. raise exception.RotationRequiredForBackup()
  2601. self._snapshot_instance(context, image_id, instance,
  2602. task_states.IMAGE_BACKUP)
  2603. @wrap_exception()
  2604. @reverts_task_state
  2605. @wrap_instance_fault
  2606. def backup_instance(self, context, image_id, instance, backup_type,
  2607. rotation):
  2608. """Backup an instance on this host.
  2609. :param backup_type: daily | weekly
  2610. :param rotation: int representing how many backups to keep around
  2611. """
  2612. self._do_snapshot_instance(context, image_id, instance, rotation)
  2613. self._rotate_backups(context, instance, backup_type, rotation)
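# A backup is just a snapshot whose image carries backup-specific
# properties (image_type, backup_type, instance_uuid); _rotate_backups
# filters on those properties to find and expire old backups.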
  2614. @wrap_exception()
  2615. @reverts_task_state
  2616. @wrap_instance_fault
  2617. @delete_image_on_error
  2618. def snapshot_instance(self, context, image_id, instance):
  2619. """Snapshot an instance on this host.
  2620. :param context: security context
  2621. :param instance: a nova.objects.instance.Instance object
  2622. :param image_id: glance.db.sqlalchemy.models.Image.Id
  2623. """
2624. # NOTE(dave-mcnally): the task state will already be set by the API,
2625. # but if the compute manager crashed or was restarted before the
2626. # request got here, the task state may have been cleared, so we set
2627. # it again and things continue normally.
  2628. try:
  2629. instance.task_state = task_states.IMAGE_SNAPSHOT
  2630. instance.save(
  2631. expected_task_state=task_states.IMAGE_SNAPSHOT_PENDING)
  2632. except exception.InstanceNotFound:
2633. # The instance may no longer exist; no point in continuing.
  2634. LOG.debug("Instance not found, could not set state %s "
  2635. "for instance.",
  2636. task_states.IMAGE_SNAPSHOT, instance=instance)
  2637. return
  2638. except exception.UnexpectedDeletingTaskStateError:
  2639. LOG.debug("Instance being deleted, snapshot cannot continue",
  2640. instance=instance)
  2641. return
  2642. self._snapshot_instance(context, image_id, instance,
  2643. task_states.IMAGE_SNAPSHOT)
  2644. def _snapshot_instance(self, context, image_id, instance,
  2645. expected_task_state):
  2646. context = context.elevated()
  2647. current_power_state = self._get_power_state(context, instance)
  2648. try:
  2649. instance.power_state = current_power_state
  2650. instance.save()
  2651. LOG.audit(_('instance snapshotting'), context=context,
  2652. instance=instance)
  2653. if instance.power_state != power_state.RUNNING:
  2654. state = instance.power_state
  2655. running = power_state.RUNNING
  2656. LOG.warn(_('trying to snapshot a non-running instance: '
  2657. '(state: %(state)s expected: %(running)s)'),
  2658. {'state': state, 'running': running},
  2659. instance=instance)
  2660. self._notify_about_instance_usage(
  2661. context, instance, "snapshot.start")
  2662. def update_task_state(task_state,
  2663. expected_state=expected_task_state):
  2664. instance.task_state = task_state
  2665. instance.save(expected_task_state=expected_state)
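# The driver reports snapshot progress through this callback, moving
# the instance through IMAGE_PENDING_UPLOAD and IMAGE_UPLOADING; the
# save after driver.snapshot() therefore expects IMAGE_UPLOADING.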
  2666. self.driver.snapshot(context, instance, image_id,
  2667. update_task_state)
  2668. instance.task_state = None
  2669. instance.save(expected_task_state=task_states.IMAGE_UPLOADING)
  2670. self._notify_about_instance_usage(context, instance,
  2671. "snapshot.end")
  2672. except (exception.InstanceNotFound,
  2673. exception.UnexpectedDeletingTaskStateError):
  2674. # the instance got deleted during the snapshot
  2675. # Quickly bail out of here
  2676. msg = 'Instance disappeared during snapshot'
  2677. LOG.debug(msg, instance=instance)
  2678. try:
  2679. image_service = glance.get_default_image_service()
  2680. image = image_service.show(context, image_id)
  2681. if image['status'] != 'active':
  2682. image_service.delete(context, image_id)
  2683. except Exception:
  2684. LOG.warning(_("Error while trying to clean up image %s"),
  2685. image_id, instance=instance)
  2686. except exception.ImageNotFound:
  2687. instance.task_state = None
  2688. instance.save()
  2689. msg = _("Image not found during snapshot")
  2690. LOG.warn(msg, instance=instance)
  2691. def _post_interrupted_snapshot_cleanup(self, context, instance):
  2692. self.driver.post_interrupted_snapshot_cleanup(context, instance)
  2693. @object_compat
  2694. @messaging.expected_exceptions(NotImplementedError)
  2695. def volume_snapshot_create(self, context, instance, volume_id,
  2696. create_info):
  2697. self.driver.volume_snapshot_create(context, instance, volume_id,
  2698. create_info)
  2699. @object_compat
  2700. @messaging.expected_exceptions(NotImplementedError)
  2701. def volume_snapshot_delete(self, context, instance, volume_id,
  2702. snapshot_id, delete_info):
  2703. self.driver.volume_snapshot_delete(context, instance, volume_id,
  2704. snapshot_id, delete_info)
  2705. @wrap_instance_fault
  2706. def _rotate_backups(self, context, instance, backup_type, rotation):
  2707. """Delete excess backups associated to an instance.
  2708. Instances are allowed a fixed number of backups (the rotation number);
  2709. this method deletes the oldest backups that exceed the rotation
  2710. threshold.
  2711. :param context: security context
2712. :param instance: Instance object
  2713. :param backup_type: daily | weekly
  2714. :param rotation: int representing how many backups to keep around;
  2715. None if rotation shouldn't be used (as in the case of snapshots)
  2716. """
  2717. filters = {'property-image_type': 'backup',
  2718. 'property-backup_type': backup_type,
  2719. 'property-instance_uuid': instance.uuid}
  2720. images = self.image_api.get_all(context, filters=filters,
  2721. sort_key='created_at', sort_dir='desc')
  2722. num_images = len(images)
  2723. LOG.debug("Found %(num_images)d images (rotation: %(rotation)d)",
  2724. {'num_images': num_images, 'rotation': rotation},
  2725. instance=instance)
  2726. if num_images > rotation:
  2727. # NOTE(sirp): this deletes all backups that exceed the rotation
  2728. # limit
  2729. excess = len(images) - rotation
  2730. LOG.debug("Rotating out %d backups", excess,
  2731. instance=instance)
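# Images were fetched sorted by created_at descending, so pop()
# removes the oldest backup on each iteration.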
  2732. for i in xrange(excess):
  2733. image = images.pop()
  2734. image_id = image['id']
  2735. LOG.debug("Deleting image %s", image_id,
  2736. instance=instance)
  2737. self.image_api.delete(context, image_id)
  2738. @object_compat
  2739. @wrap_exception()
  2740. @reverts_task_state
  2741. @wrap_instance_event
  2742. @wrap_instance_fault
  2743. def set_admin_password(self, context, instance, new_pass):
  2744. """Set the root/admin password for an instance on this host.
  2745. This is generally only called by API password resets after an
  2746. image has been built.
  2747. @param context: Nova auth context.
  2748. @param instance: Nova instance object.
  2749. @param new_pass: The admin password for the instance.
  2750. """
  2751. context = context.elevated()
  2752. if new_pass is None:
  2753. # Generate a random password
  2754. new_pass = utils.generate_password()
  2755. current_power_state = self._get_power_state(context, instance)
  2756. expected_state = power_state.RUNNING
  2757. if current_power_state != expected_state:
  2758. instance.task_state = None
  2759. instance.save(expected_task_state=task_states.UPDATING_PASSWORD)
  2760. _msg = _('Failed to set admin password. Instance %s is not'
  2761. ' running') % instance.uuid
  2762. raise exception.InstancePasswordSetFailed(
  2763. instance=instance.uuid, reason=_msg)
  2764. try:
  2765. self.driver.set_admin_password(instance, new_pass)
  2766. LOG.audit(_("Root password set"), instance=instance)
  2767. instance.task_state = None
  2768. instance.save(
  2769. expected_task_state=task_states.UPDATING_PASSWORD)
  2770. except NotImplementedError:
  2771. _msg = _('set_admin_password is not implemented '
  2772. 'by this driver or guest instance.')
  2773. LOG.warn(_msg, instance=instance)
  2774. instance.task_state = None
  2775. instance.save(
  2776. expected_task_state=task_states.UPDATING_PASSWORD)
  2777. raise NotImplementedError(_msg)
  2778. except exception.UnexpectedTaskStateError:
  2779. # interrupted by another (most likely delete) task
  2780. # do not retry
  2781. raise
  2782. except Exception as e:
  2783. # Catch all here because this could be anything.
  2784. LOG.exception(_LE('set_admin_password failed: %s'), e,
  2785. instance=instance)
  2786. self._set_instance_obj_error_state(context, instance)
  2787. # We create a new exception here so that we won't
  2788. # potentially reveal password information to the
  2789. # API caller. The real exception is logged above
  2790. _msg = _('error setting admin password')
  2791. raise exception.InstancePasswordSetFailed(
  2792. instance=instance.uuid, reason=_msg)
  2793. @wrap_exception()
  2794. @reverts_task_state
  2795. @wrap_instance_fault
  2796. def inject_file(self, context, path, file_contents, instance):
  2797. """Write a file to the specified path in an instance on this host."""
  2798. # NOTE(russellb) Remove this method, as well as the underlying virt
  2799. # driver methods, when the compute rpc interface is bumped to 4.x
  2800. # as it is no longer used.
  2801. context = context.elevated()
  2802. current_power_state = self._get_power_state(context, instance)
  2803. expected_state = power_state.RUNNING
  2804. if current_power_state != expected_state:
2805. LOG.warn(_('trying to inject a file into a non-running instance '
2806. '(state: %(current_state)s expected: %(expected_state)s)'),
  2807. {'current_state': current_power_state,
  2808. 'expected_state': expected_state},
  2809. instance=instance)
2810. LOG.audit(_('injecting file into %s'), path,
  2811. instance=instance)
  2812. self.driver.inject_file(instance, path, file_contents)
  2813. def _get_rescue_image(self, context, instance, rescue_image_ref=None):
  2814. """Determine what image should be used to boot the rescue VM."""
  2815. # 1. If rescue_image_ref is passed in, use that for rescue.
  2816. # 2. Else, use the base image associated with instance's current image.
  2817. # The idea here is to provide the customer with a rescue
  2818. # environment which they are familiar with.
  2819. # So, if they built their instance off of a Debian image,
  2820. # their rescue VM will also be Debian.
  2821. # 3. As a last resort, use instance's current image.
  2822. if not rescue_image_ref:
  2823. system_meta = utils.instance_sys_meta(instance)
  2824. rescue_image_ref = system_meta.get('image_base_image_ref')
  2825. if not rescue_image_ref:
  2826. LOG.warn(_('Unable to find a different image to use for rescue VM,'
  2827. ' using instance\'s current image'), instance=instance)
  2828. rescue_image_ref = instance.image_ref
  2829. image_meta = compute_utils.get_image_metadata(context, self.image_api,
  2830. rescue_image_ref,
  2831. instance)
  2832. # NOTE(belliott) bug #1227350 - xenapi needs the actual image id
  2833. image_meta['id'] = rescue_image_ref
  2834. return image_meta
  2835. @object_compat
  2836. @wrap_exception()
  2837. @reverts_task_state
  2838. @wrap_instance_event
  2839. @wrap_instance_fault
  2840. def rescue_instance(self, context, instance, rescue_password,
  2841. rescue_image_ref=None, clean_shutdown=True):
  2842. context = context.elevated()
  2843. LOG.audit(_('Rescuing'), context=context, instance=instance)
  2844. admin_password = (rescue_password if rescue_password else
  2845. utils.generate_password())
  2846. network_info = self._get_instance_nw_info(context, instance)
  2847. rescue_image_meta = self._get_rescue_image(context, instance,
  2848. rescue_image_ref)
  2849. extra_usage_info = {'rescue_image_name':
  2850. rescue_image_meta.get('name', '')}
  2851. self._notify_about_instance_usage(context, instance,
  2852. "rescue.start", extra_usage_info=extra_usage_info,
  2853. network_info=network_info)
  2854. try:
  2855. self._power_off_instance(context, instance, clean_shutdown)
  2856. self.driver.rescue(context, instance,
  2857. network_info,
  2858. rescue_image_meta, admin_password)
  2859. except Exception as e:
2860. LOG.exception(_LE("Error trying to rescue instance"),
  2861. instance=instance)
  2862. raise exception.InstanceNotRescuable(
  2863. instance_id=instance.uuid,
  2864. reason=_("Driver Error: %s") % e)
  2865. self.conductor_api.notify_usage_exists(context, instance,
  2866. current_period=True)
  2867. current_power_state = self._get_power_state(context, instance)
  2868. instance.vm_state = vm_states.RESCUED
  2869. instance.task_state = None
  2870. instance.power_state = current_power_state
  2871. instance.launched_at = timeutils.utcnow()
  2872. instance.save(expected_task_state=task_states.RESCUING)
  2873. self._notify_about_instance_usage(context, instance,
  2874. "rescue.end", extra_usage_info=extra_usage_info,
  2875. network_info=network_info)
  2876. @object_compat
  2877. @wrap_exception()
  2878. @reverts_task_state
  2879. @wrap_instance_event
  2880. @wrap_instance_fault
  2881. def unrescue_instance(self, context, instance):
  2882. context = context.elevated()
  2883. LOG.audit(_('Unrescuing'), context=context, instance=instance)
  2884. network_info = self._get_instance_nw_info(context, instance)
  2885. self._notify_about_instance_usage(context, instance,
  2886. "unrescue.start", network_info=network_info)
  2887. with self._error_out_instance_on_exception(context, instance):
  2888. self.driver.unrescue(instance,
  2889. network_info)
  2890. current_power_state = self._get_power_state(context, instance)
  2891. instance.vm_state = vm_states.ACTIVE
  2892. instance.task_state = None
  2893. instance.power_state = current_power_state
  2894. instance.save(expected_task_state=task_states.UNRESCUING)
  2895. self._notify_about_instance_usage(context,
  2896. instance,
  2897. "unrescue.end",
  2898. network_info=network_info)
  2899. @object_compat
  2900. @wrap_exception()
  2901. @wrap_instance_fault
  2902. def change_instance_metadata(self, context, diff, instance):
  2903. """Update the metadata published to the instance."""
  2904. LOG.debug("Changing instance metadata according to %r",
  2905. diff, instance=instance)
  2906. self.driver.change_instance_metadata(context, instance, diff)
  2907. def _cleanup_stored_instance_types(self, migration, instance,
  2908. restore_old=False):
  2909. """Clean up "old" and "new" instance_type information stored in
  2910. instance's system_metadata. Optionally update the "current"
  2911. instance_type to the saved old one first.
  2912. Returns the updated system_metadata as a dict, the
  2913. post-cleanup current instance type and the to-be dropped
  2914. instance type.
  2915. """
  2916. sys_meta = instance.system_metadata
  2917. if restore_old:
  2918. instance_type = flavors.extract_flavor(instance, 'old_')
  2919. drop_instance_type = flavors.extract_flavor(instance)
  2920. sys_meta = flavors.save_flavor_info(sys_meta, instance_type)
  2921. else:
  2922. instance_type = flavors.extract_flavor(instance)
  2923. drop_instance_type = flavors.extract_flavor(instance, 'old_')
  2924. flavors.delete_flavor_info(sys_meta, 'old_')
  2925. flavors.delete_flavor_info(sys_meta, 'new_')
  2926. return sys_meta, instance_type, drop_instance_type
  2927. @wrap_exception()
  2928. @wrap_instance_event
  2929. @wrap_instance_fault
  2930. def confirm_resize(self, context, instance, reservations, migration):
  2931. quotas = objects.Quotas.from_reservations(context,
  2932. reservations,
  2933. instance=instance)
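# The inner function is synchronized on the instance uuid, so the
# confirmation is serialized with other operations on this host that
# take the same per-instance lock.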
  2934. @utils.synchronized(instance['uuid'])
  2935. def do_confirm_resize(context, instance, migration_id):
2936. # NOTE(wangpan): Get the migration status from the DB; if it has
2937. # already been confirmed, we do nothing and return here.
  2938. LOG.debug("Going to confirm migration %s", migration_id,
  2939. context=context, instance=instance)
  2940. try:
  2941. # TODO(russellb) Why are we sending the migration object just
  2942. # to turn around and look it up from the db again?
  2943. migration = objects.Migration.get_by_id(
  2944. context.elevated(), migration_id)
  2945. except exception.MigrationNotFound:
  2946. LOG.error(_LE("Migration %s is not found during confirmation"),
  2947. migration_id, context=context, instance=instance)
  2948. quotas.rollback()
  2949. return
  2950. if migration.status == 'confirmed':
  2951. LOG.info(_("Migration %s is already confirmed") %
  2952. migration_id, context=context, instance=instance)
  2953. quotas.rollback()
  2954. return
  2955. elif migration.status not in ('finished', 'confirming'):
2956. LOG.warn(_("Unexpected confirmation status '%(status)s' of "
2957. "migration %(id)s, exiting confirmation process") %
  2958. {"status": migration.status, "id": migration_id},
  2959. context=context, instance=instance)
  2960. quotas.rollback()
  2961. return
2962. # NOTE(wangpan): Get the instance from the DB; if it has been
2963. # deleted, we do nothing and return here.
  2964. expected_attrs = ['metadata', 'system_metadata']
  2965. try:
  2966. instance = objects.Instance.get_by_uuid(
  2967. context, instance.uuid,
  2968. expected_attrs=expected_attrs)
  2969. except exception.InstanceNotFound:
2970. LOG.info(_("Instance not found during confirmation"),
  2971. context=context, instance=instance)
  2972. quotas.rollback()
  2973. return
  2974. self._confirm_resize(context, instance, quotas,
  2975. migration=migration)
  2976. do_confirm_resize(context, instance, migration.id)
  2977. def _confirm_resize(self, context, instance, quotas,
  2978. migration=None):
  2979. """Destroys the source instance."""
  2980. self._notify_about_instance_usage(context, instance,
  2981. "resize.confirm.start")
  2982. with self._error_out_instance_on_exception(context, instance,
  2983. quotas=quotas):
  2984. # NOTE(danms): delete stashed migration information
  2985. sys_meta, instance_type, old_instance_type = (
  2986. self._cleanup_stored_instance_types(migration, instance))
  2987. sys_meta.pop('old_vm_state', None)
  2988. instance.system_metadata = sys_meta
  2989. instance.save()
  2990. # NOTE(tr3buchet): tear down networks on source host
  2991. self.network_api.setup_networks_on_host(context, instance,
  2992. migration.source_compute, teardown=True)
  2993. network_info = self._get_instance_nw_info(context, instance)
  2994. self.driver.confirm_migration(migration, instance,
  2995. network_info)
  2996. migration.status = 'confirmed'
  2997. migration.save(context.elevated())
  2998. rt = self._get_resource_tracker(migration.source_node)
  2999. rt.drop_resize_claim(context, instance, old_instance_type)
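# Dropping the resize claim releases the resources the old flavor was
# still holding on the source node now that the migration is
# confirmed.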
  3000. # NOTE(mriedem): The old_vm_state could be STOPPED but the user
  3001. # might have manually powered up the instance to confirm the
  3002. # resize/migrate, so we need to check the current power state
  3003. # on the instance and set the vm_state appropriately. We default
  3004. # to ACTIVE because if the power state is not SHUTDOWN, we
  3005. # assume _sync_instance_power_state will clean it up.
  3006. p_state = instance.power_state
  3007. vm_state = None
  3008. if p_state == power_state.SHUTDOWN:
  3009. vm_state = vm_states.STOPPED
  3010. LOG.debug("Resized/migrated instance is powered off. "
  3011. "Setting vm_state to '%s'.", vm_state,
  3012. instance=instance)
  3013. else:
  3014. vm_state = vm_states.ACTIVE
  3015. instance.vm_state = vm_state
  3016. instance.task_state = None
  3017. instance.save(expected_task_state=[None, task_states.DELETING])
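# DELETING is accepted here because the user may have requested
# deletion of the instance while the resize was being confirmed; the
# delete path then takes over once the confirmation finishes.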
  3018. self._notify_about_instance_usage(
  3019. context, instance, "resize.confirm.end",
  3020. network_info=network_info)
  3021. quotas.commit()
  3022. @wrap_exception()
  3023. @reverts_task_state
  3024. @wrap_instance_event
  3025. @wrap_instance_fault
  3026. def revert_resize(self, context, instance, migration, reservations):
  3027. """Destroys the new instance on the destination machine.
  3028. Reverts the model changes, and powers on the old instance on the
  3029. source machine.
  3030. """
  3031. quotas = quotas_obj.Quotas.from_reservations(context,
  3032. reservations,
  3033. instance=instance)
  3034. # NOTE(comstud): A revert_resize is essentially a resize back to
  3035. # the old size, so we need to send a usage event here.
  3036. self.conductor_api.notify_usage_exists(
  3037. context, instance, current_period=True)
  3038. with self._error_out_instance_on_exception(context, instance,
  3039. quotas=quotas):
  3040. # NOTE(tr3buchet): tear down networks on destination host
  3041. self.network_api.setup_networks_on_host(context, instance,
  3042. teardown=True)
  3043. instance_p = obj_base.obj_to_primitive(instance)
  3044. migration_p = obj_base.obj_to_primitive(migration)
  3045. self.network_api.migrate_instance_start(context,
  3046. instance_p,
  3047. migration_p)
  3048. network_info = self._get_instance_nw_info(context, instance)
  3049. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  3050. context, instance.uuid)
  3051. block_device_info = self._get_instance_block_device_info(
  3052. context, instance, bdms=bdms)
  3053. self.driver.destroy(context, instance, network_info,
  3054. block_device_info)
  3055. self._terminate_volume_connections(context, instance, bdms)
  3056. migration.status = 'reverted'
  3057. migration.save(context.elevated())
  3058. rt = self._get_resource_tracker(instance.node)
  3059. rt.drop_resize_claim(context, instance)
  3060. self.compute_rpcapi.finish_revert_resize(context, instance,
  3061. migration, migration.source_compute,
  3062. quotas.reservations)
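# The second half of the revert (finish_revert_resize) runs on the
# source compute, which restores the old flavor, re-establishes
# networking and powers the instance back on if it was running before
# the resize.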
  3063. @wrap_exception()
  3064. @reverts_task_state
  3065. @wrap_instance_event
  3066. @wrap_instance_fault
  3067. def finish_revert_resize(self, context, instance, reservations, migration):
  3068. """Finishes the second half of reverting a resize.
3069. Brings the original source instance state back (active/shutoff) and
3070. reverts the resized attributes in the database.
  3071. """
  3072. quotas = quotas_obj.Quotas.from_reservations(context,
  3073. reservations,
  3074. instance=instance)
  3075. with self._error_out_instance_on_exception(context, instance,
  3076. quotas=quotas):
  3077. network_info = self._get_instance_nw_info(context, instance)
  3078. self._notify_about_instance_usage(
  3079. context, instance, "resize.revert.start")
  3080. sys_meta, instance_type, drop_instance_type = (
  3081. self._cleanup_stored_instance_types(migration, instance, True))
  3082. # NOTE(mriedem): delete stashed old_vm_state information; we
  3083. # default to ACTIVE for backwards compatibility if old_vm_state
  3084. # is not set
  3085. old_vm_state = sys_meta.pop('old_vm_state', vm_states.ACTIVE)
  3086. instance.system_metadata = sys_meta
  3087. instance.memory_mb = instance_type['memory_mb']
  3088. instance.vcpus = instance_type['vcpus']
  3089. instance.root_gb = instance_type['root_gb']
  3090. instance.ephemeral_gb = instance_type['ephemeral_gb']
  3091. instance.instance_type_id = instance_type['id']
  3092. instance.host = migration['source_compute']
  3093. instance.node = migration['source_node']
  3094. instance.save()
  3095. self.network_api.setup_networks_on_host(context, instance,
  3096. migration['source_compute'])
  3097. block_device_info = self._get_instance_block_device_info(
  3098. context, instance, refresh_conn_info=True)
  3099. power_on = old_vm_state != vm_states.STOPPED
  3100. self.driver.finish_revert_migration(context, instance,
  3101. network_info,
  3102. block_device_info, power_on)
  3103. instance.launched_at = timeutils.utcnow()
  3104. instance.save(expected_task_state=task_states.RESIZE_REVERTING)
  3105. instance_p = obj_base.obj_to_primitive(instance)
  3106. migration_p = obj_base.obj_to_primitive(migration)
  3107. self.network_api.migrate_instance_finish(context,
  3108. instance_p,
  3109. migration_p)
  3110. # if the original vm state was STOPPED, set it back to STOPPED
  3111. LOG.info(_("Updating instance to original state: '%s'") %
  3112. old_vm_state)
  3113. if power_on:
  3114. instance.vm_state = vm_states.ACTIVE
  3115. instance.task_state = None
  3116. instance.save()
  3117. else:
  3118. instance.task_state = task_states.POWERING_OFF
  3119. instance.save()
  3120. self.stop_instance(context, instance=instance)
  3121. self._notify_about_instance_usage(
  3122. context, instance, "resize.revert.end")
  3123. quotas.commit()
  3124. def _prep_resize(self, context, image, instance, instance_type,
  3125. quotas, request_spec, filter_properties, node):
  3126. if not filter_properties:
  3127. filter_properties = {}
  3128. if not instance['host']:
  3129. self._set_instance_error_state(context, instance)
  3130. msg = _('Instance has no source host')
  3131. raise exception.MigrationError(msg)
  3132. same_host = instance['host'] == self.host
  3133. if same_host and not CONF.allow_resize_to_same_host:
  3134. self._set_instance_error_state(context, instance)
  3135. msg = _('destination same as source!')
  3136. raise exception.MigrationError(msg)
  3137. # NOTE(danms): Stash the new instance_type to avoid having to
  3138. # look it up in the database later
  3139. sys_meta = instance.system_metadata
  3140. flavors.save_flavor_info(sys_meta, instance_type, prefix='new_')
  3141. # NOTE(mriedem): Stash the old vm_state so we can set the
  3142. # resized/reverted instance back to the same state later.
  3143. vm_state = instance['vm_state']
  3144. LOG.debug('Stashing vm_state: %s', vm_state, instance=instance)
  3145. sys_meta['old_vm_state'] = vm_state
  3146. instance.save()
  3147. limits = filter_properties.get('limits', {})
  3148. rt = self._get_resource_tracker(node)
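# resize_claim reserves resources for the new flavor on this node and
# creates the migration record (claim.migration); if the claim cannot
# be satisfied it raises, and prep_resize either reschedules the
# resize or errors out the instance.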
  3149. with rt.resize_claim(context, instance, instance_type,
  3150. image_meta=image, limits=limits) as claim:
  3151. LOG.audit(_('Migrating'), context=context, instance=instance)
  3152. self.compute_rpcapi.resize_instance(
  3153. context, instance, claim.migration, image,
  3154. instance_type, quotas.reservations)
  3155. @wrap_exception()
  3156. @reverts_task_state
  3157. @wrap_instance_event
  3158. @wrap_instance_fault
  3159. def prep_resize(self, context, image, instance, instance_type,
  3160. reservations, request_spec, filter_properties, node):
  3161. """Initiates the process of moving a running instance to another host.
  3162. Possibly changes the RAM and disk size in the process.
  3163. """
  3164. if node is None:
  3165. node = self.driver.get_available_nodes(refresh=True)[0]
  3166. LOG.debug("No node specified, defaulting to %s", node,
  3167. instance=instance)
  3168. quotas = quotas_obj.Quotas.from_reservations(context,
  3169. reservations,
  3170. instance=instance)
  3171. with self._error_out_instance_on_exception(context, instance,
  3172. quotas=quotas):
  3173. self.conductor_api.notify_usage_exists(
  3174. context, instance, current_period=True)
  3175. self._notify_about_instance_usage(
  3176. context, instance, "resize.prep.start")
  3177. try:
  3178. self._prep_resize(context, image, instance,
  3179. instance_type, quotas,
  3180. request_spec, filter_properties,
  3181. node)
  3182. # NOTE(dgenin): This is thrown in LibvirtDriver when the
  3183. # instance to be migrated is backed by LVM.
  3184. # Remove when LVM migration is implemented.
  3185. except exception.MigrationPreCheckError:
  3186. raise
  3187. except Exception:
  3188. # try to re-schedule the resize elsewhere:
  3189. exc_info = sys.exc_info()
  3190. self._reschedule_resize_or_reraise(context, image, instance,
  3191. exc_info, instance_type, quotas, request_spec,
  3192. filter_properties)
  3193. finally:
  3194. extra_usage_info = dict(
  3195. new_instance_type=instance_type['name'],
  3196. new_instance_type_id=instance_type['id'])
  3197. self._notify_about_instance_usage(
  3198. context, instance, "resize.prep.end",
  3199. extra_usage_info=extra_usage_info)
  3200. def _reschedule_resize_or_reraise(self, context, image, instance, exc_info,
  3201. instance_type, quotas, request_spec, filter_properties):
  3202. """Try to re-schedule the resize or re-raise the original error to
  3203. error out the instance.
  3204. """
  3205. if not request_spec:
  3206. request_spec = {}
  3207. if not filter_properties:
  3208. filter_properties = {}
  3209. rescheduled = False
  3210. instance_uuid = instance['uuid']
  3211. try:
  3212. reschedule_method = self.compute_task_api.resize_instance
  3213. scheduler_hint = dict(filter_properties=filter_properties)
  3214. method_args = (instance, None, scheduler_hint, instance_type,
  3215. quotas.reservations)
  3216. task_state = task_states.RESIZE_PREP
  3217. rescheduled = self._reschedule(context, request_spec,
  3218. filter_properties, instance, reschedule_method,
  3219. method_args, task_state, exc_info)
  3220. except Exception as error:
  3221. rescheduled = False
  3222. LOG.exception(_LE("Error trying to reschedule"),
  3223. instance_uuid=instance_uuid)
  3224. compute_utils.add_instance_fault_from_exc(context,
  3225. instance, error,
  3226. exc_info=sys.exc_info())
  3227. self._notify_about_instance_usage(context, instance,
  3228. 'resize.error', fault=error)
  3229. if rescheduled:
  3230. self._log_original_error(exc_info, instance_uuid)
  3231. compute_utils.add_instance_fault_from_exc(context,
  3232. instance, exc_info[1], exc_info=exc_info)
  3233. self._notify_about_instance_usage(context, instance,
  3234. 'resize.error', fault=exc_info[1])
  3235. else:
  3236. # not re-scheduling
  3237. raise exc_info[0], exc_info[1], exc_info[2]
  3238. @wrap_exception()
  3239. @reverts_task_state
  3240. @wrap_instance_event
  3241. @errors_out_migration
  3242. @wrap_instance_fault
  3243. def resize_instance(self, context, instance, image,
  3244. reservations, migration, instance_type,
  3245. clean_shutdown=True):
  3246. """Starts the migration of a running instance to another host."""
  3247. quotas = quotas_obj.Quotas.from_reservations(context,
  3248. reservations,
  3249. instance=instance)
  3250. with self._error_out_instance_on_exception(context, instance,
  3251. quotas=quotas):
  3252. if not instance_type:
  3253. instance_type = objects.Flavor.get_by_id(
  3254. context, migration['new_instance_type_id'])
  3255. network_info = self._get_instance_nw_info(context, instance)
  3256. migration.status = 'migrating'
  3257. migration.save(context.elevated())
  3258. instance.task_state = task_states.RESIZE_MIGRATING
  3259. instance.save(expected_task_state=task_states.RESIZE_PREP)
  3260. self._notify_about_instance_usage(
  3261. context, instance, "resize.start", network_info=network_info)
  3262. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  3263. context, instance.uuid)
  3264. block_device_info = self._get_instance_block_device_info(
  3265. context, instance, bdms=bdms)
  3266. timeout, retry_interval = self._get_power_off_values(context,
  3267. instance, clean_shutdown)
  3268. disk_info = self.driver.migrate_disk_and_power_off(
  3269. context, instance, migration.dest_host,
  3270. instance_type, network_info,
  3271. block_device_info,
  3272. timeout, retry_interval)
  3273. self._terminate_volume_connections(context, instance, bdms)
  3274. migration_p = obj_base.obj_to_primitive(migration)
  3275. instance_p = obj_base.obj_to_primitive(instance)
  3276. self.network_api.migrate_instance_start(context,
  3277. instance_p,
  3278. migration_p)
  3279. migration.status = 'post-migrating'
  3280. migration.save(context.elevated())
  3281. instance.host = migration.dest_compute
  3282. instance.node = migration.dest_node
  3283. instance.task_state = task_states.RESIZE_MIGRATED
  3284. instance.save(expected_task_state=task_states.RESIZE_MIGRATING)
  3285. self.compute_rpcapi.finish_resize(context, instance,
  3286. migration, image, disk_info,
  3287. migration.dest_compute, reservations=quotas.reservations)
  3288. self._notify_about_instance_usage(context, instance, "resize.end",
  3289. network_info=network_info)
  3290. self.instance_events.clear_events_for_instance(instance)
  3291. def _terminate_volume_connections(self, context, instance, bdms):
  3292. connector = self.driver.get_volume_connector(instance)
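# Only the volume connections (the exports for this host's connector)
# are torn down here; the block device mappings are kept so the
# volumes can be re-connected on the destination host.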
  3293. for bdm in bdms:
  3294. if bdm.is_volume:
  3295. self.volume_api.terminate_connection(context, bdm.volume_id,
  3296. connector)
  3297. @staticmethod
  3298. def _save_instance_info(instance, instance_type, sys_meta):
  3299. flavors.save_flavor_info(sys_meta, instance_type)
  3300. instance.instance_type_id = instance_type['id']
  3301. instance.memory_mb = instance_type['memory_mb']
  3302. instance.vcpus = instance_type['vcpus']
  3303. instance.root_gb = instance_type['root_gb']
  3304. instance.ephemeral_gb = instance_type['ephemeral_gb']
  3305. instance.system_metadata = sys_meta
  3306. instance.save()
  3307. def _finish_resize(self, context, instance, migration, disk_info,
  3308. image):
  3309. resize_instance = False
  3310. old_instance_type_id = migration['old_instance_type_id']
  3311. new_instance_type_id = migration['new_instance_type_id']
  3312. old_instance_type = flavors.extract_flavor(instance)
  3313. sys_meta = instance.system_metadata
  3314. # NOTE(mriedem): Get the old_vm_state so we know if we should
  3315. # power on the instance. If old_vm_state is not set we need to default
  3316. # to ACTIVE for backwards compatibility
  3317. old_vm_state = sys_meta.get('old_vm_state', vm_states.ACTIVE)
  3318. flavors.save_flavor_info(sys_meta,
  3319. old_instance_type,
  3320. prefix='old_')
  3321. if old_instance_type_id != new_instance_type_id:
  3322. instance_type = flavors.extract_flavor(instance, prefix='new_')
  3323. self._save_instance_info(instance, instance_type, sys_meta)
  3324. resize_instance = True
  3325. # NOTE(tr3buchet): setup networks on destination host
  3326. self.network_api.setup_networks_on_host(context, instance,
  3327. migration['dest_compute'])
  3328. instance_p = obj_base.obj_to_primitive(instance)
  3329. migration_p = obj_base.obj_to_primitive(migration)
  3330. self.network_api.migrate_instance_finish(context,
  3331. instance_p,
  3332. migration_p)
  3333. network_info = self._get_instance_nw_info(context, instance)
  3334. instance.task_state = task_states.RESIZE_FINISH
  3335. instance.system_metadata = sys_meta
  3336. instance.save(expected_task_state=task_states.RESIZE_MIGRATED)
  3337. self._notify_about_instance_usage(
  3338. context, instance, "finish_resize.start",
  3339. network_info=network_info)
  3340. block_device_info = self._get_instance_block_device_info(
  3341. context, instance, refresh_conn_info=True)
  3342. # NOTE(mriedem): If the original vm_state was STOPPED, we don't
  3343. # automatically power on the instance after it's migrated
  3344. power_on = old_vm_state != vm_states.STOPPED
  3345. try:
  3346. self.driver.finish_migration(context, migration, instance,
  3347. disk_info,
  3348. network_info,
  3349. image, resize_instance,
  3350. block_device_info, power_on)
  3351. except Exception:
  3352. with excutils.save_and_reraise_exception():
  3353. if resize_instance:
  3354. self._save_instance_info(instance,
  3355. old_instance_type, sys_meta)
  3356. migration.status = 'finished'
  3357. migration.save(context.elevated())
  3358. instance.vm_state = vm_states.RESIZED
  3359. instance.task_state = None
  3360. instance.launched_at = timeutils.utcnow()
  3361. instance.save(expected_task_state=task_states.RESIZE_FINISH)
  3362. self._notify_about_instance_usage(
  3363. context, instance, "finish_resize.end",
  3364. network_info=network_info)
  3365. @wrap_exception()
  3366. @reverts_task_state
  3367. @wrap_instance_event
  3368. @errors_out_migration
  3369. @wrap_instance_fault
  3370. def finish_resize(self, context, disk_info, image, instance,
  3371. reservations, migration):
  3372. """Completes the migration process.
  3373. Sets up the newly transferred disk and turns on the instance at its
  3374. new host machine.
  3375. """
  3376. quotas = quotas_obj.Quotas.from_reservations(context,
  3377. reservations,
  3378. instance=instance)
  3379. try:
  3380. self._finish_resize(context, instance, migration,
  3381. disk_info, image)
  3382. quotas.commit()
  3383. except Exception:
  3384. LOG.exception(_LE('Setting instance vm_state to ERROR'),
  3385. instance=instance)
  3386. with excutils.save_and_reraise_exception():
  3387. try:
  3388. quotas.rollback()
  3389. except Exception as qr_error:
  3390. LOG.exception(_LE("Failed to rollback quota for failed "
  3391. "finish_resize: %s"),
  3392. qr_error, instance=instance)
  3393. self._set_instance_error_state(context, instance)
  3394. @object_compat
  3395. @wrap_exception()
  3396. @wrap_instance_fault
  3397. def add_fixed_ip_to_instance(self, context, network_id, instance):
  3398. """Calls network_api to add new fixed_ip to instance
  3399. then injects the new network info and resets instance networking.
  3400. """
  3401. self._notify_about_instance_usage(
  3402. context, instance, "create_ip.start")
  3403. network_info = self.network_api.add_fixed_ip_to_instance(context,
  3404. instance,
  3405. network_id)
  3406. self._inject_network_info(context, instance, network_info)
  3407. self.reset_network(context, instance)
  3408. # NOTE(russellb) We just want to bump updated_at. See bug 1143466.
  3409. instance.updated_at = timeutils.utcnow()
  3410. instance.save()
  3411. self._notify_about_instance_usage(
  3412. context, instance, "create_ip.end", network_info=network_info)
  3413. @object_compat
  3414. @wrap_exception()
  3415. @wrap_instance_fault
  3416. def remove_fixed_ip_from_instance(self, context, address, instance):
  3417. """Calls network_api to remove existing fixed_ip from instance
  3418. by injecting the altered network info and resetting
  3419. instance networking.
  3420. """
  3421. self._notify_about_instance_usage(
  3422. context, instance, "delete_ip.start")
  3423. network_info = self.network_api.remove_fixed_ip_from_instance(context,
  3424. instance,
  3425. address)
  3426. self._inject_network_info(context, instance, network_info)
  3427. self.reset_network(context, instance)
  3428. # NOTE(russellb) We just want to bump updated_at. See bug 1143466.
  3429. instance.updated_at = timeutils.utcnow()
  3430. instance.save()
  3431. self._notify_about_instance_usage(
  3432. context, instance, "delete_ip.end", network_info=network_info)
  3433. @wrap_exception()
  3434. @reverts_task_state
  3435. @wrap_instance_event
  3436. @wrap_instance_fault
  3437. def pause_instance(self, context, instance):
  3438. """Pause an instance on this host."""
  3439. context = context.elevated()
  3440. LOG.audit(_('Pausing'), context=context, instance=instance)
  3441. self._notify_about_instance_usage(context, instance, 'pause.start')
  3442. self.driver.pause(instance)
  3443. current_power_state = self._get_power_state(context, instance)
  3444. instance.power_state = current_power_state
  3445. instance.vm_state = vm_states.PAUSED
  3446. instance.task_state = None
  3447. instance.save(expected_task_state=task_states.PAUSING)
  3448. self._notify_about_instance_usage(context, instance, 'pause.end')
  3449. @wrap_exception()
  3450. @reverts_task_state
  3451. @wrap_instance_event
  3452. @wrap_instance_fault
  3453. def unpause_instance(self, context, instance):
  3454. """Unpause a paused instance on this host."""
  3455. context = context.elevated()
  3456. LOG.audit(_('Unpausing'), context=context, instance=instance)
  3457. self._notify_about_instance_usage(context, instance, 'unpause.start')
  3458. self.driver.unpause(instance)
  3459. current_power_state = self._get_power_state(context, instance)
  3460. instance.power_state = current_power_state
  3461. instance.vm_state = vm_states.ACTIVE
  3462. instance.task_state = None
  3463. instance.save(expected_task_state=task_states.UNPAUSING)
  3464. self._notify_about_instance_usage(context, instance, 'unpause.end')
  3465. @wrap_exception()
  3466. def host_power_action(self, context, action):
  3467. """Reboots, shuts down or powers up the host."""
  3468. # TODO(russellb) Remove the unused host parameter from the driver API
  3469. return self.driver.host_power_action(None, action)
  3470. @wrap_exception()
  3471. def host_maintenance_mode(self, context, host, mode):
  3472. """Start/Stop host maintenance window. On start, it triggers
  3473. guest VMs evacuation.
  3474. """
  3475. return self.driver.host_maintenance_mode(host, mode)
  3476. @wrap_exception()
  3477. def set_host_enabled(self, context, enabled):
  3478. """Sets the specified host's ability to accept new instances."""
  3479. # TODO(russellb) Remove the unused host parameter from the driver API
  3480. return self.driver.set_host_enabled(None, enabled)
  3481. @wrap_exception()
  3482. def get_host_uptime(self, context):
  3483. """Returns the result of calling "uptime" on the target host."""
  3484. return self.driver.get_host_uptime(self.host)
  3485. @object_compat
  3486. @wrap_exception()
  3487. @wrap_instance_fault
  3488. def get_diagnostics(self, context, instance):
  3489. """Retrieve diagnostics for an instance on this host."""
  3490. current_power_state = self._get_power_state(context, instance)
  3491. if current_power_state == power_state.RUNNING:
  3492. LOG.audit(_("Retrieving diagnostics"), context=context,
  3493. instance=instance)
  3494. return self.driver.get_diagnostics(instance)
  3495. else:
  3496. raise exception.InstanceInvalidState(
  3497. attr='power_state',
  3498. instance_uuid=instance.uuid,
  3499. state=instance.power_state,
  3500. method='get_diagnostics')
  3501. @object_compat
  3502. @wrap_exception()
  3503. @wrap_instance_fault
  3504. def get_instance_diagnostics(self, context, instance):
  3505. """Retrieve diagnostics for an instance on this host."""
  3506. current_power_state = self._get_power_state(context, instance)
  3507. if current_power_state == power_state.RUNNING:
  3508. LOG.audit(_("Retrieving diagnostics"), context=context,
  3509. instance=instance)
  3510. diags = self.driver.get_instance_diagnostics(instance)
  3511. return diags.serialize()
  3512. else:
  3513. raise exception.InstanceInvalidState(
  3514. attr='power_state',
  3515. instance_uuid=instance.uuid,
  3516. state=instance.power_state,
  3517. method='get_diagnostics')
  3518. @wrap_exception()
  3519. @reverts_task_state
  3520. @wrap_instance_event
  3521. @wrap_instance_fault
  3522. def suspend_instance(self, context, instance):
  3523. """Suspend the given instance."""
  3524. context = context.elevated()
  3525. # Store the old state
  3526. instance.system_metadata['old_vm_state'] = instance.vm_state
  3527. with self._error_out_instance_on_exception(context, instance,
  3528. instance_state=instance.vm_state):
  3529. self.driver.suspend(instance)
  3530. current_power_state = self._get_power_state(context, instance)
  3531. instance.power_state = current_power_state
  3532. instance.vm_state = vm_states.SUSPENDED
  3533. instance.task_state = None
  3534. instance.save(expected_task_state=task_states.SUSPENDING)
  3535. self._notify_about_instance_usage(context, instance, 'suspend')
  3536. @wrap_exception()
  3537. @reverts_task_state
  3538. @wrap_instance_event
  3539. @wrap_instance_fault
  3540. def resume_instance(self, context, instance):
  3541. """Resume the given suspended instance."""
  3542. context = context.elevated()
  3543. LOG.audit(_('Resuming'), context=context, instance=instance)
  3544. network_info = self._get_instance_nw_info(context, instance)
  3545. block_device_info = self._get_instance_block_device_info(
  3546. context, instance)
  3547. self.driver.resume(context, instance, network_info,
  3548. block_device_info)
  3549. instance.power_state = self._get_power_state(context, instance)
  3550. # We default to the ACTIVE state for backwards compatibility
  3551. instance.vm_state = instance.system_metadata.pop('old_vm_state',
  3552. vm_states.ACTIVE)
  3553. instance.task_state = None
  3554. instance.save(expected_task_state=task_states.RESUMING)
  3555. self._notify_about_instance_usage(context, instance, 'resume')
  3556. @wrap_exception()
  3557. @reverts_task_state
  3558. @wrap_instance_event
  3559. @wrap_instance_fault
  3560. def shelve_instance(self, context, instance, image_id,
  3561. clean_shutdown=True):
  3562. """Shelve an instance.
  3563. This should be used when you want to take a snapshot of the instance.
  3564. It also adds system_metadata that can be used by a periodic task to
  3565. offload the shelved instance after a period of time.
  3566. :param context: request context
  3567. :param instance: an Instance object
  3568. :param image_id: an image id to snapshot to.
  3569. """
  3570. self.conductor_api.notify_usage_exists(
  3571. context, obj_base.obj_to_primitive(instance),
  3572. current_period=True)
  3573. self._notify_about_instance_usage(context, instance, 'shelve.start')
  3574. def update_task_state(task_state, expected_state=task_states.SHELVING):
  3575. shelving_state_map = {
  3576. task_states.IMAGE_PENDING_UPLOAD:
  3577. task_states.SHELVING_IMAGE_PENDING_UPLOAD,
  3578. task_states.IMAGE_UPLOADING:
  3579. task_states.SHELVING_IMAGE_UPLOADING,
  3580. task_states.SHELVING: task_states.SHELVING}
  3581. task_state = shelving_state_map[task_state]
  3582. expected_state = shelving_state_map[expected_state]
  3583. instance.task_state = task_state
  3584. instance.save(expected_task_state=expected_state)
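# The shelving_state_map above translates the generic snapshot task
# states reported by the driver into their SHELVING_* equivalents so
# the instance stays in the shelving workflow while the image is
# uploaded.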
  3585. self._power_off_instance(context, instance, clean_shutdown)
  3586. current_power_state = self._get_power_state(context, instance)
  3587. self.driver.snapshot(context, instance, image_id, update_task_state)
  3588. instance.system_metadata['shelved_at'] = timeutils.strtime()
  3589. instance.system_metadata['shelved_image_id'] = image_id
  3590. instance.system_metadata['shelved_host'] = self.host
  3591. instance.vm_state = vm_states.SHELVED
  3592. instance.task_state = None
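# shelved_offload_time == 0 means the instance should be offloaded
# from the hypervisor immediately (see the shelve_offload_instance
# call below) rather than by the periodic offload task.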
  3593. if CONF.shelved_offload_time == 0:
  3594. instance.task_state = task_states.SHELVING_OFFLOADING
  3595. instance.power_state = current_power_state
  3596. instance.save(expected_task_state=[
  3597. task_states.SHELVING,
  3598. task_states.SHELVING_IMAGE_UPLOADING])
  3599. self._notify_about_instance_usage(context, instance, 'shelve.end')
  3600. if CONF.shelved_offload_time == 0:
  3601. self.shelve_offload_instance(context, instance)
  3602. @wrap_exception()
  3603. @reverts_task_state
  3604. @wrap_instance_fault
  3605. def shelve_offload_instance(self, context, instance):
  3606. """Remove a shelved instance from the hypervisor.
  3607. This frees up those resources for use by other instances, but may lead
3608. to slower unshelve times for this instance. This method is used for
3609. volume-backed instances since restoring them doesn't involve the
  3610. potentially large download of an image.
  3611. :param context: request context
  3612. :param instance: nova.objects.instance.Instance
  3613. """
  3614. self._notify_about_instance_usage(context, instance,
  3615. 'shelve_offload.start')
  3616. self.driver.power_off(instance)
  3617. current_power_state = self._get_power_state(context, instance)
  3618. network_info = self._get_instance_nw_info(context, instance)
  3619. block_device_info = self._get_instance_block_device_info(context,
  3620. instance)
  3621. self.driver.destroy(context, instance, network_info,
  3622. block_device_info)
  3623. instance.power_state = current_power_state
  3624. instance.host = None
  3625. instance.node = None
  3626. instance.vm_state = vm_states.SHELVED_OFFLOADED
  3627. instance.task_state = None
  3628. instance.save(expected_task_state=[task_states.SHELVING,
  3629. task_states.SHELVING_OFFLOADING])
  3630. self._notify_about_instance_usage(context, instance,
  3631. 'shelve_offload.end')
  3632. @wrap_exception()
  3633. @reverts_task_state
  3634. @wrap_instance_event
  3635. @wrap_instance_fault
  3636. def unshelve_instance(self, context, instance, image,
  3637. filter_properties=None, node=None):
  3638. """Unshelve the instance.
  3639. :param context: request context
  3640. :param instance: a nova.objects.instance.Instance object
3641. :param image: an image to build from. If None, we assume a
3642. volume-backed instance.
  3643. :param filter_properties: dict containing limits, retry info etc.
  3644. :param node: target compute node
  3645. """
  3646. if filter_properties is None:
  3647. filter_properties = {}
  3648. @utils.synchronized(instance['uuid'])
  3649. def do_unshelve_instance():
  3650. self._unshelve_instance(context, instance, image,
  3651. filter_properties, node)
  3652. do_unshelve_instance()
  3653. def _unshelve_instance_key_scrub(self, instance):
  3654. """Remove data from the instance that may cause side effects."""
  3655. cleaned_keys = dict(
  3656. key_data=instance.key_data,
  3657. auto_disk_config=instance.auto_disk_config)
  3658. instance.key_data = None
  3659. instance.auto_disk_config = False
  3660. return cleaned_keys
  3661. def _unshelve_instance_key_restore(self, instance, keys):
  3662. """Restore previously scrubbed keys before saving the instance."""
  3663. instance.update(keys)
  3664. def _unshelve_instance(self, context, instance, image, filter_properties,
  3665. node):
  3666. self._notify_about_instance_usage(context, instance, 'unshelve.start')
  3667. instance.task_state = task_states.SPAWNING
  3668. instance.save()
  3669. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  3670. context, instance.uuid)
  3671. block_device_info = self._prep_block_device(context, instance, bdms,
  3672. do_check_attach=False)
  3673. scrubbed_keys = self._unshelve_instance_key_scrub(instance)
  3674. if node is None:
  3675. node = self.driver.get_available_nodes()[0]
  3676. LOG.debug('No node specified, defaulting to %s', node,
  3677. instance=instance)
  3678. rt = self._get_resource_tracker(node)
  3679. limits = filter_properties.get('limits', {})
  3680. if image:
  3681. shelved_image_ref = instance.image_ref
  3682. instance.image_ref = image['id']
  3683. self.network_api.migrate_instance_finish(context, instance,
  3684. {'source_compute': '', 'dest_compute': self.host})
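# There is no real migration record for an unshelve, so a minimal dict
# with an empty source_compute is passed to (re)associate the
# instance's networking with this host.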
  3685. network_info = self._get_instance_nw_info(context, instance)
  3686. try:
  3687. with rt.instance_claim(context, instance, limits):
  3688. self.driver.spawn(context, instance, image, injected_files=[],
  3689. admin_password=None,
  3690. network_info=network_info,
  3691. block_device_info=block_device_info)
  3692. except Exception:
  3693. with excutils.save_and_reraise_exception():
  3694. LOG.exception(_LE('Instance failed to spawn'),
  3695. instance=instance)
  3696. if image:
  3697. instance.image_ref = shelved_image_ref
  3698. self.image_api.delete(context, image['id'])
  3699. self._unshelve_instance_key_restore(instance, scrubbed_keys)
  3700. instance.power_state = self._get_power_state(context, instance)
  3701. instance.vm_state = vm_states.ACTIVE
  3702. instance.task_state = None
  3703. instance.launched_at = timeutils.utcnow()
  3704. instance.save(expected_task_state=task_states.SPAWNING)
  3705. self._notify_about_instance_usage(context, instance, 'unshelve.end')
  3706. @messaging.expected_exceptions(NotImplementedError)
  3707. @wrap_instance_fault
  3708. def reset_network(self, context, instance):
  3709. """Reset networking on the given instance."""
  3710. LOG.debug('Reset network', context=context, instance=instance)
  3711. self.driver.reset_network(instance)
  3712. def _inject_network_info(self, context, instance, network_info):
  3713. """Inject network info for the given instance."""
  3714. LOG.debug('Inject network info', context=context, instance=instance)
  3715. LOG.debug('network_info to inject: |%s|', network_info,
  3716. instance=instance)
  3717. self.driver.inject_network_info(instance,
  3718. network_info)
  3719. @wrap_instance_fault
  3720. def inject_network_info(self, context, instance):
  3721. """Inject network info, but don't return the info."""
  3722. network_info = self._get_instance_nw_info(context, instance)
  3723. self._inject_network_info(context, instance, network_info)
  3724. @object_compat
  3725. @messaging.expected_exceptions(NotImplementedError,
  3726. exception.InstanceNotFound)
  3727. @wrap_exception()
  3728. @wrap_instance_fault
  3729. def get_console_output(self, context, instance, tail_length):
  3730. """Send the console output for the given instance."""
  3731. context = context.elevated()
  3732. LOG.audit(_("Get console output"), context=context,
  3733. instance=instance)
  3734. output = self.driver.get_console_output(context, instance)
  3735. if tail_length is not None:
  3736. output = self._tail_log(output, tail_length)
  3737. return output.decode('utf-8', 'replace').encode('ascii', 'replace')
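# _tail_log returns the last `length` lines of the raw console log; a
# non-numeric or zero length yields an empty string.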
  3738. def _tail_log(self, log, length):
  3739. try:
  3740. length = int(length)
  3741. except ValueError:
  3742. length = 0
  3743. if length == 0:
  3744. return ''
  3745. else:
  3746. return '\n'.join(log.split('\n')[-int(length):])
  3747. @messaging.expected_exceptions(exception.ConsoleTypeInvalid,
  3748. exception.InstanceNotReady,
  3749. exception.InstanceNotFound,
  3750. exception.ConsoleTypeUnavailable,
  3751. NotImplementedError)
  3752. @object_compat
  3753. @wrap_exception()
  3754. @wrap_instance_fault
  3755. def get_vnc_console(self, context, console_type, instance):
  3756. """Return connection information for a vnc console."""
  3757. context = context.elevated()
  3758. LOG.debug("Getting vnc console", instance=instance)
  3759. token = str(uuid.uuid4())
  3760. if not CONF.vnc_enabled:
  3761. raise exception.ConsoleTypeUnavailable(console_type=console_type)
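# A freshly generated token is appended to the configured proxy base URL;
# the same token and access_url are folded into the connect info returned
# to the caller.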
  3762. if console_type == 'novnc':
3763. # For Essex, novncproxy_base_url must include the full path
  3764. # including the html file (like http://myhost/vnc_auto.html)
  3765. access_url = '%s?token=%s' % (CONF.novncproxy_base_url, token)
  3766. elif console_type == 'xvpvnc':
  3767. access_url = '%s?token=%s' % (CONF.xvpvncproxy_base_url, token)
  3768. else:
  3769. raise exception.ConsoleTypeInvalid(console_type=console_type)
  3770. try:
  3771. # Retrieve connect info from driver, and then decorate with our
  3772. # access info token
  3773. console = self.driver.get_vnc_console(context, instance)
  3774. connect_info = console.get_connection_info(token, access_url)
  3775. except exception.InstanceNotFound:
  3776. if instance['vm_state'] != vm_states.BUILDING:
  3777. raise
  3778. raise exception.InstanceNotReady(instance_id=instance['uuid'])
  3779. return connect_info
  3780. @object_compat
  3781. @messaging.expected_exceptions(exception.ConsoleTypeInvalid,
  3782. exception.InstanceNotReady,
  3783. exception.InstanceNotFound,
  3784. exception.ConsoleTypeUnavailable)
  3785. @wrap_exception()
  3786. @wrap_instance_fault
  3787. def get_spice_console(self, context, console_type, instance):
  3788. """Return connection information for a spice console."""
  3789. context = context.elevated()
  3790. LOG.debug("Getting spice console", instance=instance)
  3791. token = str(uuid.uuid4())
  3792. if not CONF.spice.enabled:
  3793. raise exception.ConsoleTypeUnavailable(console_type=console_type)
  3794. if console_type == 'spice-html5':
3795. # For Essex, spicehtml5proxy_base_url must include the full path
  3796. # including the html file (like http://myhost/spice_auto.html)
  3797. access_url = '%s?token=%s' % (CONF.spice.html5proxy_base_url,
  3798. token)
  3799. else:
  3800. raise exception.ConsoleTypeInvalid(console_type=console_type)
  3801. try:
  3802. # Retrieve connect info from driver, and then decorate with our
  3803. # access info token
  3804. console = self.driver.get_spice_console(context, instance)
  3805. connect_info = console.get_connection_info(token, access_url)
  3806. except exception.InstanceNotFound:
  3807. if instance['vm_state'] != vm_states.BUILDING:
  3808. raise
  3809. raise exception.InstanceNotReady(instance_id=instance['uuid'])
  3810. return connect_info
  3811. @object_compat
  3812. @messaging.expected_exceptions(exception.ConsoleTypeInvalid,
  3813. exception.InstanceNotReady,
  3814. exception.InstanceNotFound,
  3815. exception.ConsoleTypeUnavailable,
  3816. NotImplementedError)
  3817. @wrap_exception()
  3818. @wrap_instance_fault
  3819. def get_rdp_console(self, context, console_type, instance):
  3820. """Return connection information for a RDP console."""
  3821. context = context.elevated()
  3822. LOG.debug("Getting RDP console", instance=instance)
  3823. token = str(uuid.uuid4())
  3824. if not CONF.rdp.enabled:
  3825. raise exception.ConsoleTypeUnavailable(console_type=console_type)
  3826. if console_type == 'rdp-html5':
  3827. access_url = '%s?token=%s' % (CONF.rdp.html5_proxy_base_url,
  3828. token)
  3829. else:
  3830. raise exception.ConsoleTypeInvalid(console_type=console_type)
  3831. try:
  3832. # Retrieve connect info from driver, and then decorate with our
  3833. # access info token
  3834. console = self.driver.get_rdp_console(context, instance)
  3835. connect_info = console.get_connection_info(token, access_url)
  3836. except exception.InstanceNotFound:
  3837. if instance['vm_state'] != vm_states.BUILDING:
  3838. raise
  3839. raise exception.InstanceNotReady(instance_id=instance['uuid'])
  3840. return connect_info
  3841. @messaging.expected_exceptions(
  3842. exception.ConsoleTypeInvalid,
  3843. exception.InstanceNotReady,
  3844. exception.InstanceNotFound,
  3845. exception.ConsoleTypeUnavailable,
  3846. exception.SocketPortRangeExhaustedException,
  3847. exception.ImageSerialPortNumberInvalid,
  3848. exception.ImageSerialPortNumberExceedFlavorValue,
  3849. NotImplementedError)
  3850. @wrap_exception()
  3851. @wrap_instance_fault
  3852. def get_serial_console(self, context, console_type, instance):
  3853. """Returns connection information for a serial console."""
  3854. LOG.debug("Getting serial console", instance=instance)
  3855. if not CONF.serial_console.enabled:
  3856. raise exception.ConsoleTypeUnavailable(console_type=console_type)
  3857. context = context.elevated()
  3858. token = str(uuid.uuid4())
  3859. access_url = '%s?token=%s' % (CONF.serial_console.base_url, token)
  3860. try:
  3861. # Retrieve connect info from driver, and then decorate with our
  3862. # access info token
  3863. console = self.driver.get_serial_console(context, instance)
  3864. connect_info = console.get_connection_info(token, access_url)
  3865. except exception.InstanceNotFound:
  3866. if instance.vm_state != vm_states.BUILDING:
  3867. raise
  3868. raise exception.InstanceNotReady(instance_id=instance['uuid'])
  3869. return connect_info
  3870. @messaging.expected_exceptions(exception.ConsoleTypeInvalid,
  3871. exception.InstanceNotReady,
  3872. exception.InstanceNotFound)
  3873. @object_compat
  3874. @wrap_exception()
  3875. @wrap_instance_fault
  3876. def validate_console_port(self, ctxt, instance, port, console_type):
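# Ask the driver for the current connection info of the requested console
# type and report whether the given port still matches it.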
  3877. if console_type == "spice-html5":
  3878. console_info = self.driver.get_spice_console(ctxt, instance)
  3879. elif console_type == "rdp-html5":
  3880. console_info = self.driver.get_rdp_console(ctxt, instance)
  3881. elif console_type == "serial":
  3882. console_info = self.driver.get_serial_console(ctxt, instance)
  3883. else:
  3884. console_info = self.driver.get_vnc_console(ctxt, instance)
  3885. return console_info.port == port
  3886. @object_compat
  3887. @wrap_exception()
  3888. @reverts_task_state
  3889. @wrap_instance_fault
  3890. def reserve_block_device_name(self, context, instance, device,
  3891. volume_id, disk_bus=None, device_type=None,
  3892. return_bdm_object=False):
  3893. # NOTE(ndipanov): disk_bus and device_type will be set to None if not
  3894. # passed (by older clients) and defaulted by the virt driver. Remove
  3895. # default values on the next major RPC version bump.
  3896. @utils.synchronized(instance['uuid'])
  3897. def do_reserve():
  3898. bdms = (
  3899. objects.BlockDeviceMappingList.get_by_instance_uuid(
  3900. context, instance.uuid))
  3901. device_name = compute_utils.get_device_name_for_instance(
  3902. context, instance, bdms, device)
  3903. # NOTE(vish): create bdm here to avoid race condition
  3904. bdm = objects.BlockDeviceMapping(
  3905. source_type='volume', destination_type='volume',
  3906. instance_uuid=instance.uuid,
  3907. volume_id=volume_id or 'reserved',
  3908. device_name=device_name,
  3909. disk_bus=disk_bus, device_type=device_type)
  3910. bdm.create(context)
  3911. if return_bdm_object:
  3912. return bdm
  3913. else:
  3914. return device_name
  3915. return do_reserve()
  3916. @object_compat
  3917. @wrap_exception()
  3918. @reverts_task_state
  3919. @wrap_instance_fault
  3920. def attach_volume(self, context, volume_id, mountpoint,
  3921. instance, bdm=None):
  3922. """Attach a volume to an instance."""
  3923. if not bdm:
  3924. bdm = objects.BlockDeviceMapping.get_by_volume_id(
  3925. context, volume_id)
  3926. driver_bdm = driver_block_device.DriverVolumeBlockDevice(bdm)
  3927. @utils.synchronized(instance.uuid)
  3928. def do_attach_volume(context, instance, driver_bdm):
  3929. try:
  3930. return self._attach_volume(context, instance, driver_bdm)
  3931. except Exception:
  3932. with excutils.save_and_reraise_exception():
  3933. bdm.destroy(context)
  3934. do_attach_volume(context, instance, driver_bdm)
  3935. def _attach_volume(self, context, instance, bdm):
  3936. context = context.elevated()
  3937. LOG.audit(_('Attaching volume %(volume_id)s to %(mountpoint)s'),
  3938. {'volume_id': bdm.volume_id,
  3939. 'mountpoint': bdm['mount_device']},
  3940. context=context, instance=instance)
  3941. try:
  3942. bdm.attach(context, instance, self.volume_api, self.driver,
  3943. do_check_attach=False, do_driver_attach=True)
  3944. except Exception: # pylint: disable=W0702
  3945. with excutils.save_and_reraise_exception():
  3946. LOG.exception(_LE("Failed to attach %(volume_id)s "
  3947. "at %(mountpoint)s"),
  3948. {'volume_id': bdm.volume_id,
  3949. 'mountpoint': bdm['mount_device']},
  3950. context=context, instance=instance)
  3951. self.volume_api.unreserve_volume(context, bdm.volume_id)
  3952. info = {'volume_id': bdm.volume_id}
  3953. self._notify_about_instance_usage(
  3954. context, instance, "volume.attach", extra_usage_info=info)
  3955. def _detach_volume(self, context, instance, bdm):
  3956. """Do the actual driver detach using block device mapping."""
  3957. mp = bdm.device_name
  3958. volume_id = bdm.volume_id
  3959. LOG.audit(_('Detach volume %(volume_id)s from mountpoint %(mp)s'),
  3960. {'volume_id': volume_id, 'mp': mp},
  3961. context=context, instance=instance)
  3962. connection_info = jsonutils.loads(bdm.connection_info)
  3963. # NOTE(vish): We currently don't use the serial when disconnecting,
  3964. # but added for completeness in case we ever do.
  3965. if connection_info and 'serial' not in connection_info:
  3966. connection_info['serial'] = volume_id
  3967. try:
  3968. if not self.driver.instance_exists(instance):
  3969. LOG.warn(_('Detaching volume from unknown instance'),
  3970. context=context, instance=instance)
  3971. encryption = encryptors.get_encryption_metadata(
  3972. context, self.volume_api, volume_id, connection_info)
  3973. self.driver.detach_volume(connection_info,
  3974. instance,
  3975. mp,
  3976. encryption=encryption)
  3977. except exception.DiskNotFound as err:
  3978. LOG.warn(_LW('Ignoring DiskNotFound exception while detaching '
  3979. 'volume %(volume_id)s from %(mp)s: %(err)s'),
  3980. {'volume_id': volume_id, 'mp': mp, 'err': err},
  3981. instance=instance)
  3982. except Exception: # pylint: disable=W0702
  3983. with excutils.save_and_reraise_exception():
  3984. LOG.exception(_LE('Failed to detach volume %(volume_id)s '
  3985. 'from %(mp)s'),
  3986. {'volume_id': volume_id, 'mp': mp},
  3987. context=context, instance=instance)
  3988. self.volume_api.roll_detaching(context, volume_id)
  3989. @object_compat
  3990. @wrap_exception()
  3991. @reverts_task_state
  3992. @wrap_instance_fault
  3993. def detach_volume(self, context, volume_id, instance):
  3994. """Detach a volume from an instance."""
  3995. bdm = objects.BlockDeviceMapping.get_by_volume_id(
  3996. context, volume_id)
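# When volume usage polling is enabled, grab a final set of block I/O
# stats from the hypervisor (if it supports block_stats) so the usage
# cache reflects totals up to the point of detach.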
  3997. if CONF.volume_usage_poll_interval > 0:
  3998. vol_stats = []
  3999. mp = bdm.device_name
  4000. # Handle bootable volumes which will not contain /dev/
  4001. if '/dev/' in mp:
  4002. mp = mp[5:]
  4003. try:
  4004. vol_stats = self.driver.block_stats(instance.name, mp)
  4005. except NotImplementedError:
  4006. pass
  4007. if vol_stats:
  4008. LOG.debug("Updating volume usage cache with totals",
  4009. instance=instance)
  4010. rd_req, rd_bytes, wr_req, wr_bytes, flush_ops = vol_stats
  4011. self.conductor_api.vol_usage_update(context, volume_id,
  4012. rd_req, rd_bytes,
  4013. wr_req, wr_bytes,
  4014. instance,
  4015. update_totals=True)
  4016. self._detach_volume(context, instance, bdm)
  4017. connector = self.driver.get_volume_connector(instance)
  4018. self.volume_api.terminate_connection(context, volume_id, connector)
  4019. bdm.destroy()
  4020. info = dict(volume_id=volume_id)
  4021. self._notify_about_instance_usage(
  4022. context, instance, "volume.detach", extra_usage_info=info)
  4023. self.volume_api.detach(context.elevated(), volume_id)
  4024. def _init_volume_connection(self, context, new_volume_id,
  4025. old_volume_id, connector, instance, bdm):
  4026. new_cinfo = self.volume_api.initialize_connection(context,
  4027. new_volume_id,
  4028. connector)
  4029. old_cinfo = jsonutils.loads(bdm['connection_info'])
  4030. if old_cinfo and 'serial' not in old_cinfo:
  4031. old_cinfo['serial'] = old_volume_id
  4032. new_cinfo['serial'] = old_cinfo['serial']
  4033. return (old_cinfo, new_cinfo)
  4034. def _swap_volume(self, context, instance, bdm, connector, old_volume_id,
  4035. new_volume_id):
  4036. mountpoint = bdm['device_name']
  4037. failed = False
  4038. new_cinfo = None
  4039. resize_to = 0
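# If the new volume is larger than the old one, the driver is asked to
# grow the block device to the new size as part of the swap; otherwise
# resize_to stays 0 and no resize is requested.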
  4040. try:
  4041. old_cinfo, new_cinfo = self._init_volume_connection(context,
  4042. new_volume_id,
  4043. old_volume_id,
  4044. connector,
  4045. instance,
  4046. bdm)
  4047. old_vol_size = self.volume_api.get(context, old_volume_id)['size']
  4048. new_vol_size = self.volume_api.get(context, new_volume_id)['size']
  4049. if new_vol_size > old_vol_size:
  4050. resize_to = new_vol_size
  4051. self.driver.swap_volume(old_cinfo, new_cinfo, instance, mountpoint,
  4052. resize_to)
  4053. except Exception: # pylint: disable=W0702
  4054. failed = True
  4055. with excutils.save_and_reraise_exception():
  4056. if new_cinfo:
  4057. msg = _LE("Failed to swap volume %(old_volume_id)s "
  4058. "for %(new_volume_id)s")
  4059. LOG.exception(msg, {'old_volume_id': old_volume_id,
  4060. 'new_volume_id': new_volume_id},
  4061. context=context,
  4062. instance=instance)
  4063. else:
  4064. msg = _LE("Failed to connect to volume %(volume_id)s "
  4065. "with volume at %(mountpoint)s")
  4066. LOG.exception(msg, {'volume_id': new_volume_id,
  4067. 'mountpoint': bdm['device_name']},
  4068. context=context,
  4069. instance=instance)
  4070. self.volume_api.roll_detaching(context, old_volume_id)
  4071. self.volume_api.unreserve_volume(context, new_volume_id)
  4072. finally:
  4073. conn_volume = new_volume_id if failed else old_volume_id
  4074. if new_cinfo:
  4075. self.volume_api.terminate_connection(context,
  4076. conn_volume,
  4077. connector)
  4078. # If Cinder initiated the swap, it will keep
  4079. # the original ID
  4080. comp_ret = self.volume_api.migrate_volume_completion(
  4081. context,
  4082. old_volume_id,
  4083. new_volume_id,
  4084. error=failed)
  4085. return (comp_ret, new_cinfo)
  4086. @wrap_exception()
  4087. @reverts_task_state
  4088. @wrap_instance_fault
  4089. def swap_volume(self, context, old_volume_id, new_volume_id, instance):
  4090. """Swap volume for an instance."""
  4091. context = context.elevated()
  4092. bdm = objects.BlockDeviceMapping.get_by_volume_id(
  4093. context, old_volume_id, instance_uuid=instance.uuid)
  4094. connector = self.driver.get_volume_connector(instance)
  4095. comp_ret, new_cinfo = self._swap_volume(context, instance,
  4096. bdm,
  4097. connector,
  4098. old_volume_id,
  4099. new_volume_id)
  4100. save_volume_id = comp_ret['save_volume_id']
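# migrate_volume_completion reports which volume id should be kept on the
# BDM: per the note above, a Cinder-initiated swap keeps the original id,
# so the mapping is updated with whatever Cinder returns.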
  4101. # Update bdm
  4102. values = {
  4103. 'connection_info': jsonutils.dumps(new_cinfo),
  4104. 'delete_on_termination': False,
  4105. 'source_type': 'volume',
  4106. 'destination_type': 'volume',
  4107. 'snapshot_id': None,
  4108. 'volume_id': save_volume_id,
  4109. 'volume_size': None,
  4110. 'no_device': None}
  4111. bdm.update(values)
  4112. bdm.save()
  4113. @wrap_exception()
  4114. def remove_volume_connection(self, context, volume_id, instance):
  4115. """Remove a volume connection using the volume api."""
  4116. # NOTE(vish): We don't want to actually mark the volume
  4117. # detached, or delete the bdm, just remove the
  4118. # connection from this host.
  4119. # NOTE(PhilDay): Can't use object_compat decorator here as
  4120. # instance is not the second parameter
  4121. if isinstance(instance, dict):
  4122. metas = ['metadata', 'system_metadata']
  4123. instance = objects.Instance._from_db_object(
  4124. context, objects.Instance(), instance,
  4125. expected_attrs=metas)
  4126. instance._context = context
  4127. try:
  4128. bdm = objects.BlockDeviceMapping.get_by_volume_id(
  4129. context, volume_id)
  4130. self._detach_volume(context, instance, bdm)
  4131. connector = self.driver.get_volume_connector(instance)
  4132. self.volume_api.terminate_connection(context, volume_id, connector)
  4133. except exception.NotFound:
  4134. pass
  4135. @object_compat
  4136. @wrap_exception()
  4137. @reverts_task_state
  4138. @wrap_instance_fault
  4139. def attach_interface(self, context, instance, network_id, port_id,
  4140. requested_ip):
  4141. """Use hotplug to add an network adapter to an instance."""
  4142. network_info = self.network_api.allocate_port_for_instance(
  4143. context, instance, port_id, network_id, requested_ip)
  4144. if len(network_info) != 1:
  4145. LOG.error(_LE('allocate_port_for_instance returned %(ports)s '
  4146. 'ports'), dict(ports=len(network_info)))
  4147. raise exception.InterfaceAttachFailed(
  4148. instance_uuid=instance.uuid)
  4149. image_ref = instance.get('image_ref')
  4150. image_meta = compute_utils.get_image_metadata(
  4151. context, self.image_api, image_ref, instance)
  4152. self.driver.attach_interface(instance, image_meta, network_info[0])
  4153. return network_info[0]
  4154. @object_compat
  4155. @wrap_exception()
  4156. @reverts_task_state
  4157. @wrap_instance_fault
  4158. def detach_interface(self, context, instance, port_id):
  4159. """Detach an network adapter from an instance."""
  4160. network_info = instance.info_cache.network_info
  4161. condemned = None
  4162. for vif in network_info:
  4163. if vif['id'] == port_id:
  4164. condemned = vif
  4165. break
  4166. if condemned is None:
  4167. raise exception.PortNotFound(_("Port %s is not "
  4168. "attached") % port_id)
  4169. self.network_api.deallocate_port_for_instance(context, instance,
  4170. port_id)
  4171. self.driver.detach_interface(instance, condemned)
  4172. def _get_compute_info(self, context, host):
  4173. service = objects.Service.get_by_compute_host(context, host)
  4174. try:
  4175. return service.compute_node
  4176. except IndexError:
  4177. raise exception.NotFound(_("Host %s not found") % host)
  4178. @wrap_exception()
  4179. def check_instance_shared_storage(self, ctxt, instance, data):
  4180. """Check if the instance files are shared
  4181. :param context: security context
  4182. :param data: result of driver.check_instance_shared_storage_local
4183. Returns True if the instance disks are located on shared storage and
4184. False otherwise.
  4185. """
  4186. return self.driver.check_instance_shared_storage_remote(ctxt, data)
  4187. @wrap_exception()
  4188. @wrap_instance_fault
  4189. def check_can_live_migrate_destination(self, ctxt, instance,
  4190. block_migration, disk_over_commit):
  4191. """Check if it is possible to execute live migration.
  4192. This runs checks on the destination host, and then calls
  4193. back to the source host to check the results.
  4194. :param context: security context
  4195. :param instance: dict of instance data
  4196. :param block_migration: if true, prepare for block migration
  4197. :param disk_over_commit: if true, allow disk over commit
  4198. :returns: a dict containing migration info
  4199. """
  4200. src_compute_info = obj_base.obj_to_primitive(
  4201. self._get_compute_info(ctxt, instance.host))
  4202. dst_compute_info = obj_base.obj_to_primitive(
  4203. self._get_compute_info(ctxt, CONF.host))
  4204. dest_check_data = self.driver.check_can_live_migrate_destination(ctxt,
  4205. instance, src_compute_info, dst_compute_info,
  4206. block_migration, disk_over_commit)
  4207. migrate_data = {}
  4208. try:
  4209. migrate_data = self.compute_rpcapi.\
  4210. check_can_live_migrate_source(ctxt, instance,
  4211. dest_check_data)
  4212. finally:
  4213. self.driver.check_can_live_migrate_destination_cleanup(ctxt,
  4214. dest_check_data)
  4215. if 'migrate_data' in dest_check_data:
  4216. migrate_data.update(dest_check_data['migrate_data'])
  4217. return migrate_data
  4218. @wrap_exception()
  4219. @wrap_instance_fault
  4220. def check_can_live_migrate_source(self, ctxt, instance, dest_check_data):
  4221. """Check if it is possible to execute live migration.
  4222. This checks if the live migration can succeed, based on the
  4223. results from check_can_live_migrate_destination.
  4224. :param context: security context
  4225. :param instance: dict of instance data
  4226. :param dest_check_data: result of check_can_live_migrate_destination
  4227. :returns: a dict containing migration info
  4228. """
  4229. is_volume_backed = self.compute_api.is_volume_backed_instance(ctxt,
  4230. instance)
  4231. dest_check_data['is_volume_backed'] = is_volume_backed
  4232. block_device_info = self._get_instance_block_device_info(
  4233. ctxt, instance, refresh_conn_info=True)
  4234. return self.driver.check_can_live_migrate_source(ctxt, instance,
  4235. dest_check_data,
  4236. block_device_info)
  4237. @object_compat
  4238. @wrap_exception()
  4239. @wrap_instance_fault
  4240. def pre_live_migration(self, context, instance, block_migration, disk,
  4241. migrate_data):
  4242. """Preparations for live migration at dest host.
  4243. :param context: security context
  4244. :param instance: dict of instance data
  4245. :param block_migration: if true, prepare for block migration
  4246. :param migrate_data: if not None, it is a dict which holds data
  4247. required for live migration without shared
  4248. storage.
  4249. """
  4250. block_device_info = self._get_instance_block_device_info(
  4251. context, instance, refresh_conn_info=True)
  4252. network_info = self._get_instance_nw_info(context, instance)
  4253. self._notify_about_instance_usage(
  4254. context, instance, "live_migration.pre.start",
  4255. network_info=network_info)
  4256. pre_live_migration_data = self.driver.pre_live_migration(context,
  4257. instance,
  4258. block_device_info,
  4259. network_info,
  4260. disk,
  4261. migrate_data)
  4262. # NOTE(tr3buchet): setup networks on destination host
  4263. self.network_api.setup_networks_on_host(context, instance,
  4264. self.host)
4265. # Create filters for the hypervisor and firewall.
4266. # An example is nova-instance-instance-xxx,
4267. # which is written to libvirt.xml (check "virsh nwfilter-list").
4268. # This nwfilter is necessary on the destination host.
4269. # In addition, this method creates the filtering rules
4270. # on the destination host.
  4271. self.driver.ensure_filtering_rules_for_instance(instance,
  4272. network_info)
  4273. self._notify_about_instance_usage(
  4274. context, instance, "live_migration.pre.end",
  4275. network_info=network_info)
  4276. return pre_live_migration_data
  4277. @wrap_exception()
  4278. @wrap_instance_fault
  4279. def live_migration(self, context, dest, instance, block_migration,
  4280. migrate_data):
  4281. """Executing live migration.
  4282. :param context: security context
  4283. :param instance: a nova.objects.instance.Instance object
  4284. :param dest: destination host
  4285. :param block_migration: if true, prepare for block migration
  4286. :param migrate_data: implementation specific params
  4287. """
  4288. # NOTE(danms): since instance is not the first parameter, we can't
  4289. # use @object_compat on this method. Since this is the only example,
  4290. # we do this manually instead of complicating the decorator
  4291. if not isinstance(instance, obj_base.NovaObject):
  4292. expected = ['metadata', 'system_metadata',
  4293. 'security_groups', 'info_cache']
  4294. instance = objects.Instance._from_db_object(
  4295. context, objects.Instance(), instance,
  4296. expected_attrs=expected)
  4297. # Create a local copy since we'll be modifying the dictionary
  4298. migrate_data = dict(migrate_data or {})
  4299. try:
  4300. if block_migration:
  4301. block_device_info = self._get_instance_block_device_info(
  4302. context, instance)
  4303. disk = self.driver.get_instance_disk_info(
  4304. instance.name, block_device_info=block_device_info)
  4305. else:
  4306. disk = None
  4307. pre_migration_data = self.compute_rpcapi.pre_live_migration(
  4308. context, instance,
  4309. block_migration, disk, dest, migrate_data)
  4310. migrate_data['pre_live_migration_result'] = pre_migration_data
  4311. except Exception:
  4312. with excutils.save_and_reraise_exception():
  4313. LOG.exception(_LE('Pre live migration failed at %s'),
  4314. dest, instance=instance)
  4315. self._rollback_live_migration(context, instance, dest,
  4316. block_migration, migrate_data)
  4317. # Executing live migration
4318. # live_migration might raise exceptions, but
4319. # nothing needs to be recovered in this version.
  4320. self.driver.live_migration(context, instance, dest,
  4321. self._post_live_migration,
  4322. self._rollback_live_migration,
  4323. block_migration, migrate_data)
  4324. def _live_migration_cleanup_flags(self, block_migration, migrate_data):
  4325. """Determine whether disks or intance path need to be cleaned up after
  4326. live migration (at source on success, at destination on rollback)
  4327. Block migration needs empty image at destination host before migration
  4328. starts, so if any failure occurs, any empty images has to be deleted.
  4329. Also Volume backed live migration w/o shared storage needs to delete
  4330. newly created instance-xxx dir on the destination as a part of its
  4331. rollback process
  4332. :param block_migration: if true, it was a block migration
  4333. :param migrate_data: implementation specific data
  4334. :returns: (bool, bool) -- do_cleanup, destroy_disks
  4335. """
  4336. # NOTE(angdraug): block migration wouldn't have been allowed if either
  4337. # block storage or instance path were shared
  4338. is_shared_block_storage = not block_migration
  4339. is_shared_instance_path = not block_migration
  4340. if migrate_data:
  4341. is_shared_block_storage = migrate_data.get(
  4342. 'is_shared_block_storage', is_shared_block_storage)
  4343. is_shared_instance_path = migrate_data.get(
  4344. 'is_shared_instance_path', is_shared_instance_path)
4345. # No instance is booting at the source host, but the instance dir
4346. # must be deleted to prepare for the next block migration or the next
4347. # live migration w/o shared storage
  4348. do_cleanup = block_migration or not is_shared_instance_path
  4349. destroy_disks = not is_shared_block_storage
  4350. return (do_cleanup, destroy_disks)
  4351. @wrap_exception()
  4352. @wrap_instance_fault
  4353. def _post_live_migration(self, ctxt, instance,
  4354. dest, block_migration=False, migrate_data=None):
  4355. """Post operations for live migration.
4356. This method is called from live_migration
4357. and mainly updates the database record.
  4358. :param ctxt: security context
  4359. :param instance: instance dict
  4360. :param dest: destination host
  4361. :param block_migration: if true, prepare for block migration
  4362. :param migrate_data: if not None, it is a dict which has data
  4363. required for live migration without shared storage
  4364. """
  4365. LOG.info(_('_post_live_migration() is started..'),
  4366. instance=instance)
  4367. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  4368. ctxt, instance['uuid'])
  4369. # Cleanup source host post live-migration
  4370. block_device_info = self._get_instance_block_device_info(
  4371. ctxt, instance, bdms=bdms)
  4372. self.driver.post_live_migration(ctxt, instance, block_device_info,
  4373. migrate_data)
  4374. # Detaching volumes.
  4375. connector = self.driver.get_volume_connector(instance)
  4376. for bdm in bdms:
  4377. # NOTE(vish): We don't want to actually mark the volume
  4378. # detached, or delete the bdm, just remove the
  4379. # connection from this host.
  4380. # remove the volume connection without detaching from hypervisor
  4381. # because the instance is not running anymore on the current host
  4382. if bdm.is_volume:
  4383. self.volume_api.terminate_connection(ctxt, bdm.volume_id,
  4384. connector)
  4385. # Releasing vlan.
  4386. # (not necessary in current implementation?)
  4387. network_info = self._get_instance_nw_info(ctxt, instance)
  4388. self._notify_about_instance_usage(ctxt, instance,
  4389. "live_migration._post.start",
  4390. network_info=network_info)
  4391. # Releasing security group ingress rule.
  4392. self.driver.unfilter_instance(instance,
  4393. network_info)
  4394. migration = {'source_compute': self.host,
  4395. 'dest_compute': dest, }
  4396. self.network_api.migrate_instance_start(ctxt,
  4397. instance,
  4398. migration)
  4399. destroy_vifs = False
  4400. try:
  4401. self.driver.post_live_migration_at_source(ctxt, instance,
  4402. network_info)
  4403. except NotImplementedError as ex:
  4404. LOG.debug(ex, instance=instance)
  4405. # For all hypervisors other than libvirt, there is a possibility
  4406. # they are unplugging networks from source node in the cleanup
  4407. # method
  4408. destroy_vifs = True
4409. # Define the domain at the destination host; without doing so,
4410. # pause/suspend/terminate do not work.
  4411. self.compute_rpcapi.post_live_migration_at_destination(ctxt,
  4412. instance, block_migration, dest)
  4413. do_cleanup, destroy_disks = self._live_migration_cleanup_flags(
  4414. block_migration, migrate_data)
  4415. if do_cleanup:
  4416. self.driver.cleanup(ctxt, instance, network_info,
  4417. destroy_disks=destroy_disks,
  4418. migrate_data=migrate_data,
  4419. destroy_vifs=destroy_vifs)
  4420. # NOTE(tr3buchet): tear down networks on source host
  4421. self.network_api.setup_networks_on_host(ctxt, instance,
  4422. self.host, teardown=True)
  4423. self.instance_events.clear_events_for_instance(instance)
  4424. # NOTE(timello): make sure we update available resources on source
  4425. # host even before next periodic task.
  4426. self.update_available_resource(ctxt)
  4427. self._notify_about_instance_usage(ctxt, instance,
  4428. "live_migration._post.end",
  4429. network_info=network_info)
  4430. LOG.info(_('Migrating instance to %s finished successfully.'),
  4431. dest, instance=instance)
  4432. LOG.info(_("You may see the error \"libvirt: QEMU error: "
  4433. "Domain not found: no domain with matching name.\" "
  4434. "This error can be safely ignored."),
  4435. instance=instance)
  4436. if CONF.vnc_enabled or CONF.spice.enabled or CONF.rdp.enabled:
  4437. if CONF.cells.enable:
  4438. self.cells_rpcapi.consoleauth_delete_tokens(ctxt,
  4439. instance['uuid'])
  4440. else:
  4441. self.consoleauth_rpcapi.delete_tokens_for_instance(ctxt,
  4442. instance['uuid'])
  4443. @object_compat
  4444. @wrap_exception()
  4445. @wrap_instance_fault
  4446. def post_live_migration_at_destination(self, context, instance,
  4447. block_migration):
  4448. """Post operations for live migration .
  4449. :param context: security context
  4450. :param instance: Instance dict
  4451. :param block_migration: if true, prepare for block migration
  4452. """
  4453. LOG.info(_('Post operation of migration started'),
  4454. instance=instance)
  4455. # NOTE(tr3buchet): setup networks on destination host
  4456. # this is called a second time because
  4457. # multi_host does not create the bridge in
  4458. # plug_vifs
  4459. self.network_api.setup_networks_on_host(context, instance,
  4460. self.host)
  4461. migration = {'source_compute': instance['host'],
  4462. 'dest_compute': self.host, }
  4463. self.network_api.migrate_instance_finish(context,
  4464. instance,
  4465. migration)
  4466. network_info = self._get_instance_nw_info(context, instance)
  4467. self._notify_about_instance_usage(
  4468. context, instance, "live_migration.post.dest.start",
  4469. network_info=network_info)
  4470. block_device_info = self._get_instance_block_device_info(context,
  4471. instance)
  4472. self.driver.post_live_migration_at_destination(context, instance,
  4473. network_info,
  4474. block_migration, block_device_info)
  4475. # Restore instance state
  4476. current_power_state = self._get_power_state(context, instance)
  4477. node_name = None
  4478. try:
  4479. compute_node = self._get_compute_info(context, self.host)
  4480. node_name = compute_node.hypervisor_hostname
  4481. except exception.NotFound:
  4482. LOG.exception(_LE('Failed to get compute_info for %s'), self.host)
  4483. finally:
  4484. instance.host = self.host
  4485. instance.power_state = current_power_state
  4486. instance.vm_state = vm_states.ACTIVE
  4487. instance.task_state = None
  4488. instance.node = node_name
  4489. instance.save(expected_task_state=task_states.MIGRATING)
  4490. # NOTE(vish): this is necessary to update dhcp
  4491. self.network_api.setup_networks_on_host(context, instance, self.host)
  4492. self._notify_about_instance_usage(
  4493. context, instance, "live_migration.post.dest.end",
  4494. network_info=network_info)
  4495. @wrap_exception()
  4496. @wrap_instance_fault
  4497. def _rollback_live_migration(self, context, instance,
  4498. dest, block_migration, migrate_data=None):
  4499. """Recovers Instance/volume state from migrating -> running.
  4500. :param context: security context
  4501. :param instance: nova.db.sqlalchemy.models.Instance
4502. :param dest:
4503. This method is called from the live migration source host;
4504. this param specifies the destination host.
  4505. :param block_migration: if true, prepare for block migration
  4506. :param migrate_data:
  4507. if not none, contains implementation specific data.
  4508. """
  4509. instance.vm_state = vm_states.ACTIVE
  4510. instance.task_state = None
  4511. instance.save(expected_task_state=[task_states.MIGRATING])
  4512. # NOTE(tr3buchet): setup networks on source host (really it's re-setup)
  4513. self.network_api.setup_networks_on_host(context, instance, self.host)
  4514. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  4515. context, instance['uuid'])
  4516. for bdm in bdms:
  4517. if bdm.is_volume:
  4518. self.compute_rpcapi.remove_volume_connection(
  4519. context, instance, bdm.volume_id, dest)
  4520. self._notify_about_instance_usage(context, instance,
  4521. "live_migration._rollback.start")
  4522. do_cleanup, destroy_disks = self._live_migration_cleanup_flags(
  4523. block_migration, migrate_data)
  4524. if do_cleanup:
  4525. self.compute_rpcapi.rollback_live_migration_at_destination(
  4526. context, instance, dest, destroy_disks=destroy_disks,
  4527. migrate_data=migrate_data)
  4528. self._notify_about_instance_usage(context, instance,
  4529. "live_migration._rollback.end")
  4530. @object_compat
  4531. @wrap_exception()
  4532. @wrap_instance_fault
  4533. def rollback_live_migration_at_destination(self, context, instance,
  4534. destroy_disks=True,
  4535. migrate_data=None):
  4536. """Cleaning up image directory that is created pre_live_migration.
  4537. :param context: security context
  4538. :param instance: a nova.objects.instance.Instance object sent over rpc
  4539. """
  4540. network_info = self._get_instance_nw_info(context, instance)
  4541. self._notify_about_instance_usage(
  4542. context, instance, "live_migration.rollback.dest.start",
  4543. network_info=network_info)
  4544. # NOTE(tr3buchet): tear down networks on destination host
  4545. self.network_api.setup_networks_on_host(context, instance,
  4546. self.host, teardown=True)
  4547. # NOTE(vish): The mapping is passed in so the driver can disconnect
  4548. # from remote volumes if necessary
  4549. block_device_info = self._get_instance_block_device_info(context,
  4550. instance)
  4551. self.driver.rollback_live_migration_at_destination(
  4552. context, instance, network_info, block_device_info,
  4553. destroy_disks=destroy_disks, migrate_data=migrate_data)
  4554. self._notify_about_instance_usage(
  4555. context, instance, "live_migration.rollback.dest.end",
  4556. network_info=network_info)
  4557. @periodic_task.periodic_task(
  4558. spacing=CONF.heal_instance_info_cache_interval)
  4559. def _heal_instance_info_cache(self, context):
  4560. """Called periodically. On every call, try to update the
  4561. info_cache's network information for another instance by
4562. calling the network manager.
  4563. This is implemented by keeping a cache of uuids of instances
  4564. that live on this host. On each call, we pop one off of a
  4565. list, pull the DB record, and try the call to the network API.
4566. If anything errors, don't fail, as it's possible the instance
  4567. has been deleted, etc.
  4568. """
  4569. heal_interval = CONF.heal_instance_info_cache_interval
  4570. if not heal_interval:
  4571. return
  4572. instance_uuids = getattr(self, '_instance_uuids_to_heal', [])
  4573. instance = None
  4574. LOG.debug('Starting heal instance info cache')
  4575. if not instance_uuids:
  4576. # The list of instances to heal is empty so rebuild it
  4577. LOG.debug('Rebuilding the list of instances to heal')
  4578. db_instances = objects.InstanceList.get_by_host(
  4579. context, self.host, expected_attrs=[], use_slave=True)
  4580. for inst in db_instances:
  4581. # We don't want to refresh the cache for instances
  4582. # which are building or deleting so don't put them
  4583. # in the list. If they are building they will get
  4584. # added to the list next time we build it.
  4585. if (inst.vm_state == vm_states.BUILDING):
  4586. LOG.debug('Skipping network cache update for instance '
  4587. 'because it is Building.', instance=inst)
  4588. continue
  4589. if (inst.task_state == task_states.DELETING):
  4590. LOG.debug('Skipping network cache update for instance '
  4591. 'because it is being deleted.', instance=inst)
  4592. continue
  4593. if not instance:
  4594. # Save the first one we find so we don't
  4595. # have to get it again
  4596. instance = inst
  4597. else:
  4598. instance_uuids.append(inst['uuid'])
  4599. self._instance_uuids_to_heal = instance_uuids
  4600. else:
  4601. # Find the next valid instance on the list
  4602. while instance_uuids:
  4603. try:
  4604. inst = objects.Instance.get_by_uuid(
  4605. context, instance_uuids.pop(0),
  4606. expected_attrs=['system_metadata', 'info_cache'],
  4607. use_slave=True)
  4608. except exception.InstanceNotFound:
  4609. # Instance is gone. Try to grab another.
  4610. continue
  4611. # Check the instance hasn't been migrated
  4612. if inst.host != self.host:
  4613. LOG.debug('Skipping network cache update for instance '
  4614. 'because it has been migrated to another '
  4615. 'host.', instance=inst)
4616. # Check the instance isn't being deleted
  4617. elif inst.task_state == task_states.DELETING:
  4618. LOG.debug('Skipping network cache update for instance '
  4619. 'because it is being deleted.', instance=inst)
  4620. else:
  4621. instance = inst
  4622. break
  4623. if instance:
  4624. # We have an instance now to refresh
  4625. try:
  4626. # Call to network API to get instance info.. this will
  4627. # force an update to the instance's info_cache
  4628. self._get_instance_nw_info(context, instance, use_slave=True)
  4629. LOG.debug('Updated the network info_cache for instance',
  4630. instance=instance)
  4631. except exception.InstanceNotFound:
  4632. # Instance is gone.
  4633. LOG.debug('Instance no longer exists. Unable to refresh',
  4634. instance=instance)
  4635. return
  4636. except Exception:
  4637. LOG.error(_LE('An error occurred while refreshing the network '
  4638. 'cache.'), instance=instance, exc_info=True)
  4639. else:
  4640. LOG.debug("Didn't find any instances for network info cache "
  4641. "update.")
  4642. @periodic_task.periodic_task
  4643. def _poll_rebooting_instances(self, context):
  4644. if CONF.reboot_timeout > 0:
  4645. filters = {'task_state': task_states.REBOOTING,
  4646. 'host': self.host}
  4647. rebooting = objects.InstanceList.get_by_filters(
  4648. context, filters, expected_attrs=[], use_slave=True)
  4649. to_poll = []
  4650. for instance in rebooting:
  4651. if timeutils.is_older_than(instance['updated_at'],
  4652. CONF.reboot_timeout):
  4653. to_poll.append(instance)
  4654. self.driver.poll_rebooting_instances(CONF.reboot_timeout, to_poll)
  4655. @periodic_task.periodic_task
  4656. def _poll_rescued_instances(self, context):
  4657. if CONF.rescue_timeout > 0:
  4658. filters = {'vm_state': vm_states.RESCUED,
  4659. 'host': self.host}
  4660. rescued_instances = objects.InstanceList.get_by_filters(
  4661. context, filters, expected_attrs=["system_metadata"],
  4662. use_slave=True)
  4663. to_unrescue = []
  4664. for instance in rescued_instances:
  4665. if timeutils.is_older_than(instance['launched_at'],
  4666. CONF.rescue_timeout):
  4667. to_unrescue.append(instance)
  4668. for instance in to_unrescue:
  4669. self.compute_api.unrescue(context, instance)
  4670. @periodic_task.periodic_task
  4671. def _poll_unconfirmed_resizes(self, context):
  4672. if CONF.resize_confirm_window == 0:
  4673. return
  4674. migrations = objects.MigrationList.get_unconfirmed_by_dest_compute(
  4675. context, CONF.resize_confirm_window, self.host,
  4676. use_slave=True)
  4677. migrations_info = dict(migration_count=len(migrations),
  4678. confirm_window=CONF.resize_confirm_window)
  4679. if migrations_info["migration_count"] > 0:
  4680. LOG.info(_("Found %(migration_count)d unconfirmed migrations "
  4681. "older than %(confirm_window)d seconds"),
  4682. migrations_info)
  4683. def _set_migration_to_error(migration, reason, **kwargs):
  4684. LOG.warn(_("Setting migration %(migration_id)s to error: "
  4685. "%(reason)s"),
  4686. {'migration_id': migration['id'], 'reason': reason},
  4687. **kwargs)
  4688. migration.status = 'error'
  4689. migration.save(context.elevated())
  4690. for migration in migrations:
  4691. instance_uuid = migration.instance_uuid
  4692. LOG.info(_("Automatically confirming migration "
  4693. "%(migration_id)s for instance %(instance_uuid)s"),
  4694. {'migration_id': migration.id,
  4695. 'instance_uuid': instance_uuid})
  4696. expected_attrs = ['metadata', 'system_metadata']
  4697. try:
  4698. instance = objects.Instance.get_by_uuid(context,
  4699. instance_uuid, expected_attrs=expected_attrs,
  4700. use_slave=True)
  4701. except exception.InstanceNotFound:
  4702. reason = (_("Instance %s not found") %
  4703. instance_uuid)
  4704. _set_migration_to_error(migration, reason)
  4705. continue
  4706. if instance['vm_state'] == vm_states.ERROR:
  4707. reason = _("In ERROR state")
  4708. _set_migration_to_error(migration, reason,
  4709. instance=instance)
  4710. continue
4711. # race condition: an instance in DELETING state should not have its
4712. # migration set to error; otherwise an instance that is about to be
4713. # deleted while in RESIZED state
4714. # will not be able to confirm the resize
  4715. if instance.task_state in [task_states.DELETING,
  4716. task_states.SOFT_DELETING]:
  4717. msg = ("Instance being deleted or soft deleted during resize "
  4718. "confirmation. Skipping.")
  4719. LOG.debug(msg, instance=instance)
  4720. continue
  4721. # race condition: This condition is hit when this method is
  4722. # called between the save of the migration record with a status of
  4723. # finished and the save of the instance object with a state of
  4724. # RESIZED. The migration record should not be set to error.
  4725. if instance.task_state == task_states.RESIZE_FINISH:
  4726. msg = ("Instance still resizing during resize "
  4727. "confirmation. Skipping.")
  4728. LOG.debug(msg, instance=instance)
  4729. continue
  4730. vm_state = instance['vm_state']
  4731. task_state = instance['task_state']
  4732. if vm_state != vm_states.RESIZED or task_state is not None:
  4733. reason = (_("In states %(vm_state)s/%(task_state)s, not "
  4734. "RESIZED/None") %
  4735. {'vm_state': vm_state,
  4736. 'task_state': task_state})
  4737. _set_migration_to_error(migration, reason,
  4738. instance=instance)
  4739. continue
  4740. try:
  4741. self.compute_api.confirm_resize(context, instance,
  4742. migration=migration)
  4743. except Exception as e:
  4744. LOG.info(_("Error auto-confirming resize: %s. "
  4745. "Will retry later."),
  4746. e, instance=instance)
  4747. @periodic_task.periodic_task(spacing=CONF.shelved_poll_interval)
  4748. def _poll_shelved_instances(self, context):
  4749. filters = {'vm_state': vm_states.SHELVED,
  4750. 'host': self.host}
  4751. shelved_instances = objects.InstanceList.get_by_filters(
  4752. context, filters=filters, expected_attrs=['system_metadata'],
  4753. use_slave=True)
  4754. to_gc = []
  4755. for instance in shelved_instances:
  4756. sys_meta = instance.system_metadata
  4757. shelved_at = timeutils.parse_strtime(sys_meta['shelved_at'])
  4758. if timeutils.is_older_than(shelved_at, CONF.shelved_offload_time):
  4759. to_gc.append(instance)
  4760. for instance in to_gc:
  4761. try:
  4762. instance.task_state = task_states.SHELVING_OFFLOADING
  4763. instance.save()
  4764. self.shelve_offload_instance(context, instance)
  4765. except Exception:
  4766. LOG.exception(_LE('Periodic task failed to offload instance.'),
  4767. instance=instance)
  4768. @periodic_task.periodic_task
  4769. def _instance_usage_audit(self, context):
  4770. if not CONF.instance_usage_audit:
  4771. return
  4772. if compute_utils.has_audit_been_run(context,
  4773. self.conductor_api,
  4774. self.host):
  4775. return
  4776. begin, end = utils.last_completed_audit_period()
  4777. instances = objects.InstanceList.get_active_by_window_joined(
  4778. context, begin, end, host=self.host,
  4779. expected_attrs=['system_metadata', 'info_cache', 'metadata'],
  4780. use_slave=True)
  4781. num_instances = len(instances)
  4782. errors = 0
  4783. successes = 0
  4784. LOG.info(_("Running instance usage audit for"
  4785. " host %(host)s from %(begin_time)s to "
  4786. "%(end_time)s. %(number_instances)s"
  4787. " instances."),
  4788. dict(host=self.host,
  4789. begin_time=begin,
  4790. end_time=end,
  4791. number_instances=num_instances))
  4792. start_time = time.time()
  4793. compute_utils.start_instance_usage_audit(context,
  4794. self.conductor_api,
  4795. begin, end,
  4796. self.host, num_instances)
  4797. for instance in instances:
  4798. try:
  4799. self.conductor_api.notify_usage_exists(
  4800. context, instance,
  4801. ignore_missing_network_data=False)
  4802. successes += 1
  4803. except Exception:
  4804. LOG.exception(_LE('Failed to generate usage '
  4805. 'audit for instance '
  4806. 'on host %s'), self.host,
  4807. instance=instance)
  4808. errors += 1
  4809. compute_utils.finish_instance_usage_audit(context,
  4810. self.conductor_api,
  4811. begin, end,
  4812. self.host, errors,
  4813. "Instance usage audit ran "
  4814. "for host %s, %s instances "
  4815. "in %s seconds." % (
  4816. self.host,
  4817. num_instances,
  4818. time.time() - start_time))
  4819. @periodic_task.periodic_task(spacing=CONF.bandwidth_poll_interval)
  4820. def _poll_bandwidth_usage(self, context):
  4821. if not self._bw_usage_supported:
  4822. return
  4823. prev_time, start_time = utils.last_completed_audit_period()
  4824. curr_time = time.time()
  4825. if (curr_time - self._last_bw_usage_poll >
  4826. CONF.bandwidth_poll_interval):
  4827. self._last_bw_usage_poll = curr_time
  4828. LOG.info(_("Updating bandwidth usage cache"))
  4829. cells_update_interval = CONF.cells.bandwidth_update_interval
  4830. if (cells_update_interval > 0 and
  4831. curr_time - self._last_bw_usage_cell_update >
  4832. cells_update_interval):
  4833. self._last_bw_usage_cell_update = curr_time
  4834. update_cells = True
  4835. else:
  4836. update_cells = False
  4837. instances = objects.InstanceList.get_by_host(context,
  4838. self.host,
  4839. use_slave=True)
  4840. try:
  4841. bw_counters = self.driver.get_all_bw_counters(instances)
  4842. except NotImplementedError:
  4843. # NOTE(mdragon): Not all hypervisors have bandwidth polling
4844. # implemented yet. If they don't, it doesn't break anything,
  4845. # they just don't get the info in the usage events.
4846. # NOTE(PhilDay): Record that it's not supported so we can
  4847. # skip fast on future calls rather than waste effort getting
  4848. # the list of instances.
  4849. LOG.warning(_("Bandwidth usage not supported by hypervisor."))
  4850. self._bw_usage_supported = False
  4851. return
  4852. refreshed = timeutils.utcnow()
  4853. for bw_ctr in bw_counters:
  4854. # Allow switching of greenthreads between queries.
  4855. greenthread.sleep(0)
  4856. bw_in = 0
  4857. bw_out = 0
  4858. last_ctr_in = None
  4859. last_ctr_out = None
  4860. usage = objects.BandwidthUsage.get_by_instance_uuid_and_mac(
  4861. context, bw_ctr['uuid'], bw_ctr['mac_address'],
  4862. start_period=start_time, use_slave=True)
  4863. if usage:
  4864. bw_in = usage.bw_in
  4865. bw_out = usage.bw_out
  4866. last_ctr_in = usage.last_ctr_in
  4867. last_ctr_out = usage.last_ctr_out
  4868. else:
  4869. usage = (objects.BandwidthUsage.
  4870. get_by_instance_uuid_and_mac(
  4871. context, bw_ctr['uuid'], bw_ctr['mac_address'],
  4872. start_period=prev_time, use_slave=True))
  4873. if usage:
  4874. last_ctr_in = usage.last_ctr_in
  4875. last_ctr_out = usage.last_ctr_out
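# If the hypervisor counter has wrapped (current reading below the last
# recorded one) only the post-rollover portion is counted; otherwise the
# delta since the last reading is added to the cached totals.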
  4876. if last_ctr_in is not None:
  4877. if bw_ctr['bw_in'] < last_ctr_in:
  4878. # counter rollover
  4879. bw_in += bw_ctr['bw_in']
  4880. else:
  4881. bw_in += (bw_ctr['bw_in'] - last_ctr_in)
  4882. if last_ctr_out is not None:
  4883. if bw_ctr['bw_out'] < last_ctr_out:
  4884. # counter rollover
  4885. bw_out += bw_ctr['bw_out']
  4886. else:
  4887. bw_out += (bw_ctr['bw_out'] - last_ctr_out)
  4888. objects.BandwidthUsage.create(context,
  4889. bw_ctr['uuid'],
  4890. bw_ctr['mac_address'],
  4891. bw_in,
  4892. bw_out,
  4893. bw_ctr['bw_in'],
  4894. bw_ctr['bw_out'],
  4895. start_period=start_time,
  4896. last_refreshed=refreshed,
  4897. update_cells=update_cells)
  4898. def _get_host_volume_bdms(self, context, use_slave=False):
  4899. """Return all block device mappings on a compute host."""
  4900. compute_host_bdms = []
  4901. instances = objects.InstanceList.get_by_host(context, self.host)
  4902. for instance in instances:
  4903. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  4904. context, instance.uuid, use_slave=use_slave)
  4905. instance_bdms = [bdm for bdm in bdms if bdm.is_volume]
  4906. compute_host_bdms.append(dict(instance=instance,
  4907. instance_bdms=instance_bdms))
  4908. return compute_host_bdms
  4909. def _update_volume_usage_cache(self, context, vol_usages):
  4910. """Updates the volume usage cache table with a list of stats."""
  4911. for usage in vol_usages:
  4912. # Allow switching of greenthreads between queries.
  4913. greenthread.sleep(0)
  4914. self.conductor_api.vol_usage_update(context, usage['volume'],
  4915. usage['rd_req'],
  4916. usage['rd_bytes'],
  4917. usage['wr_req'],
  4918. usage['wr_bytes'],
  4919. usage['instance'])
  4920. @periodic_task.periodic_task(spacing=CONF.volume_usage_poll_interval)
  4921. def _poll_volume_usage(self, context, start_time=None):
  4922. if CONF.volume_usage_poll_interval == 0:
  4923. return
  4924. if not start_time:
  4925. start_time = utils.last_completed_audit_period()[1]
  4926. compute_host_bdms = self._get_host_volume_bdms(context,
  4927. use_slave=True)
  4928. if not compute_host_bdms:
  4929. return
  4930. LOG.debug("Updating volume usage cache")
  4931. try:
  4932. vol_usages = self.driver.get_all_volume_usage(context,
  4933. compute_host_bdms)
  4934. except NotImplementedError:
  4935. return
  4936. self._update_volume_usage_cache(context, vol_usages)
  4937. @periodic_task.periodic_task(spacing=CONF.sync_power_state_interval,
  4938. run_immediately=True)
  4939. def _sync_power_states(self, context):
  4940. """Align power states between the database and the hypervisor.
  4941. To sync power state data we make a DB call to get the number of
  4942. virtual machines known by the hypervisor and if the number matches the
  4943. number of virtual machines known by the database, we proceed in a lazy
  4944. loop, one database record at a time, checking if the hypervisor has the
  4945. same power state as is in the database.
  4946. """
  4947. db_instances = objects.InstanceList.get_by_host(context,
  4948. self.host,
  4949. use_slave=True)
  4950. num_vm_instances = self.driver.get_num_instances()
  4951. num_db_instances = len(db_instances)
  4952. if num_vm_instances != num_db_instances:
  4953. LOG.warn(_("While synchronizing instance power states, found "
  4954. "%(num_db_instances)s instances in the database and "
  4955. "%(num_vm_instances)s instances on the hypervisor."),
  4956. {'num_db_instances': num_db_instances,
  4957. 'num_vm_instances': num_vm_instances})
  4958. def _sync(db_instance):
  4959. # NOTE(melwitt): This must be synchronized as we query state from
  4960. # two separate sources, the driver and the database.
  4961. # They are set (in stop_instance) and read, in sync.
  4962. @utils.synchronized(db_instance.uuid)
  4963. def query_driver_power_state_and_sync():
  4964. self._query_driver_power_state_and_sync(context, db_instance)
  4965. try:
  4966. query_driver_power_state_and_sync()
  4967. except Exception:
  4968. LOG.exception(_LE("Periodic sync_power_state task had an "
  4969. "error while processing an instance."),
  4970. instance=db_instance)
  4971. self._syncs_in_progress.pop(db_instance.uuid)
  4972. for db_instance in db_instances:
  4973. # process syncs asynchronously - don't want instance locking to
  4974. # block entire periodic task thread
  4975. uuid = db_instance.uuid
  4976. if uuid in self._syncs_in_progress:
4977. LOG.debug('Sync already in progress for %s', uuid)
  4978. else:
4979. LOG.debug('Triggering sync for uuid %s', uuid)
  4980. self._syncs_in_progress[uuid] = True
  4981. self._sync_power_pool.spawn_n(_sync, db_instance)

    def _query_driver_power_state_and_sync(self, context, db_instance):
        if db_instance.task_state is not None:
            LOG.info(_LI("During sync_power_state the instance has a "
                         "pending task (%(task)s). Skip."),
                     {'task': db_instance.task_state}, instance=db_instance)
            return
        # No pending tasks. Now try to figure out the real vm_power_state.
        try:
            vm_instance = self.driver.get_info(db_instance)
            vm_power_state = vm_instance['state']
        except exception.InstanceNotFound:
            vm_power_state = power_state.NOSTATE
        # Note(maoy): the above get_info call might take a long time,
        # for example, because of a broken libvirt driver.
        try:
            self._sync_instance_power_state(context,
                                            db_instance,
                                            vm_power_state,
                                            use_slave=True)
        except exception.InstanceNotFound:
            # NOTE(hanlind): If the instance gets deleted during sync,
            # silently ignore.
            pass

    def _sync_instance_power_state(self, context, db_instance, vm_power_state,
                                   use_slave=False):
        """Align instance power state between the database and hypervisor.

        If the instance is not found on the hypervisor, but is in the
        database, then a stop() API call will be made on the instance.
        """

        # We re-query the DB to get the latest instance info to minimize
        # (not eliminate) race conditions.
        db_instance.refresh(use_slave=use_slave)
        db_power_state = db_instance.power_state
        vm_state = db_instance.vm_state

        if self.host != db_instance.host:
            # on the sending end of nova-compute _sync_power_state
            # may have yielded to the greenthread performing a live
            # migration; this in turn has changed the resident host
            # for the VM; however, the instance is still active, it
            # is just in the process of migrating to another host.
            # This implies that the compute source must relinquish
            # control to the compute destination.
            LOG.info(_("During the sync_power process the "
                       "instance has moved from "
                       "host %(src)s to host %(dst)s") %
                     {'src': db_instance.host,
                      'dst': self.host},
                     instance=db_instance)
            return
        elif db_instance.task_state is not None:
            # on the receiving end of nova-compute, it could happen
            # that the DB instance already reports the new resident
            # host but the actual VM has not shown up on the hypervisor
            # yet. In this case, let the loop continue and run the
            # state sync in a later round.
            LOG.info(_("During sync_power_state the instance has a "
                       "pending task (%(task)s). Skip."),
                     {'task': db_instance.task_state},
                     instance=db_instance)
            return
        if vm_power_state != db_power_state:
            # power_state is always updated from hypervisor to db
            db_instance.power_state = vm_power_state
            db_instance.save()
            db_power_state = vm_power_state

        # Note(maoy): Now resolve the discrepancy between vm_state and
        # vm_power_state. We go through all possible vm_states.
        if vm_state in (vm_states.BUILDING,
                        vm_states.RESCUED,
                        vm_states.RESIZED,
                        vm_states.SUSPENDED,
                        vm_states.ERROR):
            # TODO(maoy): we ignore these vm_states for now.
            pass
        elif vm_state == vm_states.ACTIVE:
            # The only rational power state should be RUNNING
            if vm_power_state in (power_state.SHUTDOWN,
                                  power_state.CRASHED):
                LOG.warn(_LW("Instance shutdown by itself. Calling the stop "
                             "API. Current vm_state: %(vm_state)s, current "
                             "task_state: %(task_state)s, current DB "
                             "power_state: %(db_power_state)s, current VM "
                             "power_state: %(vm_power_state)s"),
                         {'vm_state': vm_state,
                          'task_state': db_instance.task_state,
                          'db_power_state': db_power_state,
                          'vm_power_state': vm_power_state},
                         instance=db_instance)
                try:
                    # Note(maoy): here we call the API instead of
                    # brutally updating the vm_state in the database
                    # to allow all the hooks and checks to be performed.
                    if db_instance.shutdown_terminate:
                        self.compute_api.delete(context, db_instance)
                    else:
                        self.compute_api.stop(context, db_instance)
                except Exception:
                    # Note(maoy): there is no need to propagate the error
                    # because the same power_state will be retrieved next
                    # time and retried.
                    # For example, there might be another task scheduled.
                    LOG.exception(_LE("error during stop() in "
                                      "sync_power_state."),
                                  instance=db_instance)
            elif vm_power_state == power_state.SUSPENDED:
                LOG.warn(_("Instance is suspended unexpectedly. Calling "
                           "the stop API."), instance=db_instance)
                try:
                    self.compute_api.stop(context, db_instance)
                except Exception:
                    LOG.exception(_LE("error during stop() in "
                                      "sync_power_state."),
                                  instance=db_instance)
            elif vm_power_state == power_state.PAUSED:
                # Note(maoy): a VM may get into the paused state not only
                # because of a user request via the API, but also
                # due to (temporary) external instrumentation.
                # Before the virt layer can reliably report the reason,
                # we simply ignore the state discrepancy. In many cases,
                # the VM state will go back to running after the external
                # instrumentation is done. See bug 1097806 for details.
                LOG.warn(_("Instance is paused unexpectedly. Ignore."),
                         instance=db_instance)
            elif vm_power_state == power_state.NOSTATE:
                # Occasionally, depending on the status of the hypervisor,
                # which could be restarting for example, an instance may
                # not be found. Therefore just log the condition.
                LOG.warn(_("Instance is unexpectedly not found. Ignore."),
                         instance=db_instance)
        elif vm_state == vm_states.STOPPED:
            if vm_power_state not in (power_state.NOSTATE,
                                      power_state.SHUTDOWN,
                                      power_state.CRASHED):
                LOG.warn(_LW("Instance is not stopped. Calling "
                             "the stop API. Current vm_state: %(vm_state)s, "
                             "current task_state: %(task_state)s, "
                             "current DB power_state: %(db_power_state)s, "
                             "current VM power_state: %(vm_power_state)s"),
                         {'vm_state': vm_state,
                          'task_state': db_instance.task_state,
                          'db_power_state': db_power_state,
                          'vm_power_state': vm_power_state},
                         instance=db_instance)
                try:
                    # NOTE(russellb) Force the stop, because normally the
                    # compute API would not allow an attempt to stop a stopped
                    # instance.
                    self.compute_api.force_stop(context, db_instance)
                except Exception:
                    LOG.exception(_LE("error during stop() in "
                                      "sync_power_state."),
                                  instance=db_instance)
        elif vm_state == vm_states.PAUSED:
            if vm_power_state in (power_state.SHUTDOWN,
                                  power_state.CRASHED):
                LOG.warn(_("Paused instance shutdown by itself. Calling "
                           "the stop API."), instance=db_instance)
                try:
                    self.compute_api.force_stop(context, db_instance)
                except Exception:
                    LOG.exception(_LE("error during stop() in "
                                      "sync_power_state."),
                                  instance=db_instance)
        elif vm_state in (vm_states.SOFT_DELETED,
                          vm_states.DELETED):
            if vm_power_state not in (power_state.NOSTATE,
                                      power_state.SHUTDOWN):
                # Note(maoy): this should be taken care of periodically in
                # _cleanup_running_deleted_instances().
                LOG.warn(_("Instance is not (soft-)deleted."),
                         instance=db_instance)
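
    # Reading aid only -- a summary of the decision table implemented above
    # (derived directly from the branches; the power_state column in the DB
    # is always refreshed from the hypervisor first):
    #
    #   ACTIVE   + SHUTDOWN/CRASHED         -> stop() (delete() if
    #                                          shutdown_terminate)
    #   ACTIVE   + SUSPENDED                -> stop()
    #   ACTIVE   + PAUSED or NOSTATE        -> warn and ignore
    #   STOPPED  + anything but off/NOSTATE -> force_stop()
    #   PAUSED   + SHUTDOWN/CRASHED         -> force_stop()
    #   SOFT_DELETED/DELETED + not off      -> warn only
    #   other vm_states                     -> ignored for now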

    @periodic_task.periodic_task
    def _reclaim_queued_deletes(self, context):
        """Reclaim instances that are queued for deletion."""
        interval = CONF.reclaim_instance_interval
        if interval <= 0:
            LOG.debug("CONF.reclaim_instance_interval <= 0, skipping...")
            return

        # TODO(comstud, jichenjc): Dummy quota object for now. See bug
        # 1296414. The only case in which the quota might be inconsistent is
        # if the compute node died between setting the instance state to
        # SOFT_DELETED and committing the quota reservation to the DB. When
        # the compute node starts again it has no way of knowing whether the
        # reservation was committed or has expired; since this is a rare
        # case, it is left as a TODO.
        quotas = quotas_obj.Quotas.from_reservations(context, None)

        filters = {'vm_state': vm_states.SOFT_DELETED,
                   'task_state': None,
                   'host': self.host}
        instances = objects.InstanceList.get_by_filters(
            context, filters,
            expected_attrs=instance_obj.INSTANCE_DEFAULT_FIELDS,
            use_slave=True)
        for instance in instances:
            if self._deleted_old_enough(instance, interval):
                bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
                    context, instance.uuid)
                LOG.info(_('Reclaiming deleted instance'), instance=instance)
                try:
                    self._delete_instance(context, instance, bdms, quotas)
                except Exception as e:
                    LOG.warning(_("Periodic reclaim failed to delete "
                                  "instance: %s"),
                                e, instance=instance)
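
    # Illustrative nova.conf setting for the reclaim task above (the value
    # is an example, not necessarily the shipped default):
    #
    #     [DEFAULT]
    #     # seconds an instance stays SOFT_DELETED before being reclaimed;
    #     # a value <= 0 makes this task a no-op (see the check above)
    #     reclaim_instance_interval = 3600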

    @periodic_task.periodic_task
    def update_available_resource(self, context):
        """See driver.get_available_resource()

        Periodic process that keeps the compute host's understanding of
        resource availability and usage in sync with the underlying
        hypervisor.

        :param context: security context
        """
        new_resource_tracker_dict = {}
        nodenames = set(self.driver.get_available_nodes())
        for nodename in nodenames:
            rt = self._get_resource_tracker(nodename)
            rt.update_available_resource(context)
            new_resource_tracker_dict[nodename] = rt

        # Delete orphan compute nodes not reported by the driver but still
        # present in the database.
        compute_nodes_in_db = self._get_compute_nodes_in_db(context,
                                                            use_slave=True)

        for cn in compute_nodes_in_db:
            if cn.hypervisor_hostname not in nodenames:
                LOG.audit(_("Deleting orphan compute node %s") % cn.id)
                cn.destroy()

        self._resource_tracker_dict = new_resource_tracker_dict
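
    # NOTE: one ResourceTracker is kept per hypervisor nodename; rebuilding
    # the dict on every pass also drops trackers for nodes the driver no
    # longer reports, mirroring the orphan compute-node cleanup above.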

    def _get_compute_nodes_in_db(self, context, use_slave=False):
        service = objects.Service.get_by_compute_host(context, self.host,
                                                      use_slave=use_slave)
        if not service:
            LOG.error(_LE("No service record for host %s"), self.host)
            return []
        return objects.ComputeNodeList.get_by_service(context,
                                                      service,
                                                      use_slave=use_slave)

    @periodic_task.periodic_task(
        spacing=CONF.running_deleted_instance_poll_interval)
    def _cleanup_running_deleted_instances(self, context):
        """Clean up any instances which are erroneously still running after
        having been deleted.

        Valid actions to take are:

        1. noop - do nothing
        2. log - log which instances are erroneously running
        3. reap - shutdown and cleanup any erroneously running instances
        4. shutdown - power off *and disable* any erroneously running
           instances

        The use-case for this cleanup task is: for various reasons, it may be
        possible for the database to show an instance as deleted but for that
        instance to still be running on a host machine (see bug
        https://bugs.launchpad.net/nova/+bug/911366).

        This cleanup task is a cross-hypervisor utility for finding these
        zombied instances and either logging the discrepancy (likely what you
        should do in production), or automatically reaping the instances (more
        appropriate for dev environments).
        """
        action = CONF.running_deleted_instance_action

        if action == "noop":
            return

        # NOTE(sirp): admin contexts don't ordinarily return deleted records
        with utils.temporary_mutation(context, read_deleted="yes"):
            for instance in self._running_deleted_instances(context):
                bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
                    context, instance.uuid, use_slave=True)

                if action == "log":
                    LOG.warning(_("Detected instance with name label "
                                  "'%s' which is marked as "
                                  "DELETED but still present on host."),
                                instance['name'], instance=instance)

                elif action == 'shutdown':
                    LOG.info(_("Powering off instance with name label "
                               "'%s' which is marked as "
                               "DELETED but still present on host."),
                             instance['name'], instance=instance)
                    try:
                        try:
                            # disable starting the instance
                            self.driver.set_bootable(instance, False)
                        except NotImplementedError:
                            LOG.warn(_("set_bootable is not implemented for "
                                       "the current driver"))
                        # and power it off
                        self.driver.power_off(instance)
                    except Exception:
                        msg = _("Failed to power off instance")
                        LOG.warn(msg, instance=instance, exc_info=True)

                elif action == 'reap':
                    LOG.info(_("Destroying instance with name label "
                               "'%s' which is marked as "
                               "DELETED but still present on host."),
                             instance['name'], instance=instance)
                    self.instance_events.clear_events_for_instance(instance)
                    try:
                        self._shutdown_instance(context, instance, bdms,
                                                notify=False)
                        self._cleanup_volumes(context, instance['uuid'], bdms)
                    except Exception as e:
                        LOG.warning(_("Periodic cleanup failed to delete "
                                      "instance: %s"),
                                    e, instance=instance)
                else:
                    raise Exception(_("Unrecognized value '%s'"
                                      " for CONF.running_deleted_"
                                      "instance_action") % action)

    def _running_deleted_instances(self, context):
        """Returns a list of instances nova thinks are deleted,
        but the hypervisor thinks are still running.
        """
        timeout = CONF.running_deleted_instance_timeout
        filters = {'deleted': True,
                   'soft_deleted': False,
                   'host': self.host}
        instances = self._get_instances_on_driver(context, filters)
        return [i for i in instances if self._deleted_old_enough(i, timeout)]

    def _deleted_old_enough(self, instance, timeout):
        deleted_at = instance['deleted_at']
        if isinstance(instance, obj_base.NovaObject) and deleted_at:
            deleted_at = deleted_at.replace(tzinfo=None)
        return (not deleted_at or timeutils.is_older_than(deleted_at, timeout))
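
    # NOTE: NovaObject datetime fields are timezone-aware, while
    # timeutils.is_older_than() in this era compares against a naive UTC
    # datetime, hence the tzinfo strip above; an instance with no
    # deleted_at recorded is treated as old enough.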

    @contextlib.contextmanager
    def _error_out_instance_on_exception(self, context, instance,
                                         quotas=None,
                                         instance_state=vm_states.ACTIVE):
        instance_uuid = instance['uuid']
        try:
            yield
        except NotImplementedError as error:
            with excutils.save_and_reraise_exception():
                if quotas:
                    quotas.rollback()
                LOG.info(_("Setting instance back to %(state)s after: "
                           "%(error)s") %
                         {'state': instance_state, 'error': error},
                         instance_uuid=instance_uuid)
                self._instance_update(context, instance_uuid,
                                      vm_state=instance_state,
                                      task_state=None)
        except exception.InstanceFaultRollback as error:
            if quotas:
                quotas.rollback()
            LOG.info(_("Setting instance back to ACTIVE after: %s"),
                     error, instance_uuid=instance_uuid)
            self._instance_update(context, instance_uuid,
                                  vm_state=vm_states.ACTIVE,
                                  task_state=None)
            raise error.inner_exception
        except Exception:
            LOG.exception(_LE('Setting instance vm_state to ERROR'),
                          instance_uuid=instance_uuid)
            with excutils.save_and_reraise_exception():
                if quotas:
                    quotas.rollback()
                self._set_instance_error_state(context, instance)
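
    # Sketch of how this context manager is typically used by the task
    # methods in this class (illustrative only):
    #
    #     with self._error_out_instance_on_exception(context, instance):
    #         ...perform work that may raise...
    #
    # NotImplementedError restores the given instance_state,
    # InstanceFaultRollback restores ACTIVE and re-raises the wrapped inner
    # exception, and any other exception puts the instance into ERROR
    # before re-raising.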

    @aggregate_object_compat
    @wrap_exception()
    def add_aggregate_host(self, context, aggregate, host, slave_info):
        """Notify hypervisor of change (for hypervisor pools)."""
        try:
            self.driver.add_to_aggregate(context, aggregate, host,
                                         slave_info=slave_info)
        except NotImplementedError:
            LOG.debug('Hypervisor driver does not support '
                      'add_aggregate_host')
        except exception.AggregateError:
            with excutils.save_and_reraise_exception():
                self.driver.undo_aggregate_operation(
                    context,
                    aggregate.delete_host,
                    aggregate, host)

    @aggregate_object_compat
    @wrap_exception()
    def remove_aggregate_host(self, context, host, slave_info, aggregate):
        """Removes a host from a physical hypervisor pool."""
        try:
            self.driver.remove_from_aggregate(context, aggregate, host,
                                              slave_info=slave_info)
        except NotImplementedError:
            LOG.debug('Hypervisor driver does not support '
                      'remove_aggregate_host')
        except (exception.AggregateError,
                exception.InvalidAggregateAction) as e:
            with excutils.save_and_reraise_exception():
                self.driver.undo_aggregate_operation(
                    context,
                    aggregate.add_host,
                    aggregate, host,
                    isinstance(e, exception.AggregateError))
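
    # NOTE: these aggregate hooks only matter for drivers that map host
    # aggregates onto hypervisor pools (historically the XenAPI driver);
    # other drivers raise NotImplementedError, which is logged and ignored.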

    def _process_instance_event(self, instance, event):
        _event = self.instance_events.pop_instance_event(instance, event)
        if _event:
            LOG.debug('Processing event %(event)s',
                      {'event': event.key}, instance=instance)
            _event.send(event)

    @wrap_exception()
    def external_instance_event(self, context, instances, events):
        # NOTE(danms): Some event types are handled by the manager, such
        # as when we're asked to update the instance's info_cache. If it's
        # not one of those, look for some thread(s) waiting for the event and
        # unblock them if so.
        for event in events:
            instance = [inst for inst in instances
                        if inst.uuid == event.instance_uuid][0]
            LOG.debug('Received event %(event)s',
                      {'event': event.key},
                      instance=instance)
            if event.name == 'network-changed':
                self.network_api.get_instance_nw_info(context, instance)
            else:
                self._process_instance_event(instance, event)
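
    # Sketch of the waiting side of this event flow, assumed to be the
    # ComputeVirtAPI.wait_for_instance_event() helper defined earlier in
    # this module (treat the exact name and signature as an assumption):
    #
    #     with self.virtapi.wait_for_instance_event(instance, event_names):
    #         ...start the operation whose completion an external service
    #         ...will signal via external_instance_event()
    #
    # 'network-changed' events are consumed here directly by refreshing the
    # instance's network info cache rather than by waking a waiter.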

    @periodic_task.periodic_task(spacing=CONF.image_cache_manager_interval,
                                 external_process_ok=True)
    def _run_image_cache_manager_pass(self, context):
        """Run a single pass of the image cache manager."""

        if not self.driver.capabilities["has_imagecache"]:
            return

        # Determine what other nodes use this storage
        storage_users.register_storage_use(CONF.instances_path, CONF.host)
        nodes = storage_users.get_storage_users(CONF.instances_path)

        # Filter all_instances to only include those nodes which share this
        # storage path.
        # TODO(mikal): this should be further refactored so that the cache
        # cleanup code doesn't know what those instances are, just a remote
        # count, and then this logic should be pushed up the stack.
        filters = {'deleted': False,
                   'soft_deleted': True,
                   'host': nodes}
        filtered_instances = objects.InstanceList.get_by_filters(context,
                                 filters, expected_attrs=[], use_slave=True)

        self.driver.manage_image_cache(context, filtered_instances)

    @periodic_task.periodic_task(spacing=CONF.instance_delete_interval)
    def _run_pending_deletes(self, context):
        """Retry any pending instance file deletes."""
        LOG.debug('Cleaning up deleted instances')
        filters = {'deleted': True,
                   'soft_deleted': False,
                   'host': CONF.host,
                   'cleaned': False}
        attrs = ['info_cache', 'security_groups', 'system_metadata']
        with utils.temporary_mutation(context, read_deleted='yes'):
            instances = objects.InstanceList.get_by_filters(
                context, filters, expected_attrs=attrs, use_slave=True)
        LOG.debug('There are %d instances to clean', len(instances))

        for instance in instances:
            attempts = int(instance.system_metadata.get('clean_attempts', '0'))
            LOG.debug('Instance has had %(attempts)s of %(max)s '
                      'cleanup attempts',
                      {'attempts': attempts,
                       'max': CONF.maximum_instance_delete_attempts},
                      instance=instance)
            if attempts < CONF.maximum_instance_delete_attempts:
                success = self.driver.delete_instance_files(instance)

                instance.system_metadata['clean_attempts'] = str(attempts + 1)
                if success:
                    instance.cleaned = True
                with utils.temporary_mutation(context, read_deleted='yes'):
                    instance.save(context)
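
    # Illustrative settings for the pending-delete retry task above (option
    # names come from the surrounding code; the values are examples, not
    # guaranteed defaults):
    #
    #     [DEFAULT]
    #     instance_delete_interval = 300
    #     maximum_instance_delete_attempts = 5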