diff --git a/deployment/nova/nova-compute-container-puppet.yaml b/deployment/nova/nova-compute-container-puppet.yaml index 6926148d8c..745fb30f67 100644 --- a/deployment/nova/nova-compute-container-puppet.yaml +++ b/deployment/nova/nova-compute-container-puppet.yaml @@ -74,6 +74,14 @@ parameters: type: json tags: - role_specific + NovaComputeStartupDelay: + default: 0 + description: Delays the startup of nova-compute service after compute node is + booted. This is to give Ceph a chance to get back to a healthy state + before booting instances after an overcloud reboot. + type: number + constraints: + - range: { min: 0, max: 600 } EnableInstanceHA: default: false description: Whether to enable an Instance Ha configurarion or not. @@ -706,6 +714,11 @@ resources: conditions: enable_instance_ha: {equals: [{get_param: EnableInstanceHA}, true]} + compute_startup_delay: + and: + - not: {equals: [{get_param: NovaComputeStartupDelay}, 0]} + - not: enable_instance_ha + enable_live_migration_tunnelled: or: @@ -865,6 +878,12 @@ outputs: list_join: - ' ' - - if: + - compute_startup_delay + - str_replace: + template: '/var/lib/nova/delay-nova-compute --delay DELAY --nova-binary' + params: { DELAY: {get_param: NovaComputeStartupDelay} } + - '' + - if: - enable_instance_ha - /var/lib/nova/instanceha/check-run-nova-compute - /usr/bin/nova-compute @@ -1131,6 +1150,15 @@ outputs: - name: If instance HA is enabled on the node activate the evacuation completed check file: path=/var/lib/nova/instanceha/enabled state=touch when: iha_nodes.stdout|lower is search('"'+ansible_hostname|lower+'"') + - name: Do we prepend nova startup with a delay + set_fact: + nova_compute_delay: {get_param: NovaComputeStartupDelay} + - name: install nova-compute delay wrapper script + copy: + content: {get_file: ../../scripts/delay-nova-compute} + dest: /var/lib/nova/delay-nova-compute + mode: 0755 + when: nova_compute_delay|int > 0 - name: Is irqbalance enabled set_fact: compute_irqbalance_disabled: {get_attr: 
[RoleParametersValue, value, compute_disable_irqbalance]} diff --git a/releasenotes/notes/nova-compute-startup-delay-fdb1f229840bd0e6.yaml b/releasenotes/notes/nova-compute-startup-delay-fdb1f229840bd0e6.yaml new file mode 100644 index 0000000000..b62f1f89f3 --- /dev/null +++ b/releasenotes/notes/nova-compute-startup-delay-fdb1f229840bd0e6.yaml @@ -0,0 +1,8 @@ +--- +features: + - | + The parameter ``NovaComputeStartupDelay`` allows the operator to delay the + startup of ``nova-compute`` after a compute node reboot. + When all the overcloud nodes are rebooted at the same time, it can take a + few minutes for the Ceph cluster to get back to a healthy state. This delay will + prevent the instances from booting before the Ceph cluster is healthy. diff --git a/roles/ComputeHCI.yaml b/roles/ComputeHCI.yaml index 842d28a9b7..1adfca660a 100644 --- a/roles/ComputeHCI.yaml +++ b/roles/ComputeHCI.yaml @@ -17,6 +17,7 @@ subnet: storage_mgmt_subnet RoleParametersDefault: TunedProfileName: "throughput-performance" + NovaComputeStartupDelay: 180 # CephOSD present so serial has to be 1 update_serial: 1 ServicesDefault: diff --git a/roles/ComputeHCIOvsDpdk.yaml b/roles/ComputeHCIOvsDpdk.yaml index 2361016d5f..332f5dddcd 100644 --- a/roles/ComputeHCIOvsDpdk.yaml +++ b/roles/ComputeHCIOvsDpdk.yaml @@ -23,6 +23,7 @@ VhostuserSocketGroup: "hugetlbfs" NovaLibvirtRxQueueSize: 1024 NovaLibvirtTxQueueSize: 1024 + NovaComputeStartupDelay: 180 ServicesDefault: - OS::TripleO::Services::Aide - OS::TripleO::Services::AuditD diff --git a/roles/ComputeHCISriov.yaml b/roles/ComputeHCISriov.yaml index c49a57d48e..2a9a8b8da3 100644 --- a/roles/ComputeHCISriov.yaml +++ b/roles/ComputeHCISriov.yaml @@ -17,6 +17,7 @@ subnet: storage_mgmt_subnet RoleParametersDefault: TunedProfileName: "cpu-partitioning" + NovaComputeStartupDelay: 180 # CephOSD present so serial has to be 1 update_serial: 1 ServicesDefault: diff --git a/scripts/delay-nova-compute b/scripts/delay-nova-compute new file mode 100644 index 
#!/usr/libexec/platform-python
"""Optionally delay the first startup of nova-compute, then exec it.

This wrapper was created to add an optional delay to the startup of
nova-compute. We know that instances will fail to boot, after a compute
reboot, if ceph is not healthy.

Ideally, we would poll ceph to get its health, but it's not guaranteed that
the compute node will have access to the keys.

The delay is applied only when the state file does not exist yet; the file is
created once the delay has elapsed, so later invocations start nova-compute
immediately for as long as that file is present.
"""

import argparse
import logging
import os
import sys
import time

LOG = logging.getLogger("delay-nova-compute")


def build_parser():
    """Return the argument parser for the options this wrapper understands."""
    parser = argparse.ArgumentParser(
        description='Delay the startup of nova-compute, then execute it.')
    parser.add_argument('--config-file', dest='nova_config', action='store',
                        default="/etc/nova/nova.conf",
                        help='path to nova configuration '
                             '(default: /etc/nova/nova.conf)')
    parser.add_argument('--nova-binary', dest='nova_binary', action='store',
                        default="/usr/bin/nova-compute",
                        help='path to nova compute binary '
                             '(default: /usr/bin/nova-compute)')
    parser.add_argument('--delay', dest='delay', action='store',
                        default=120, type=int,
                        help='Number of seconds to wait until nova-compute '
                             'is started')
    parser.add_argument('--state-file', dest='state_file', action='store',
                        default="/run/nova-compute-delayed",
                        help='file exists if we already delayed nova-compute '
                             'startup (default: /run/nova-compute-delayed)')
    return parser


def build_command(argv):
    """Parse *argv* and build the argument vector for the real binary.

    :param argv: full argument vector, program name first (like ``sys.argv``).
    :returns: ``(args, command)`` where ``args`` is the parsed namespace and
        ``command`` is the list to hand to ``os.execv``: the nova binary,
        ``--config-file <path>``, then every argument this wrapper did not
        recognise, in their original order.
    """
    # Skip the program name so that only real arguments are forwarded.
    args, passthrough = build_parser().parse_known_args(argv[1:])
    command = [args.nova_binary, '--config-file', args.nova_config]
    command.extend(passthrough)
    return args, command


def delay_once(delay, state_file, sleep=time.sleep):
    """Sleep for *delay* seconds unless *state_file* already exists.

    :param delay: number of seconds to wait; negative values are treated as 0
        because ``time.sleep`` rejects them.
    :param state_file: marker file; created (empty) after the delay.
    :param sleep: callable used to wait, ``time.sleep`` by default.
    :returns: ``True`` if the delay was applied, ``False`` if it was skipped
        because the state file was already present.
    """
    if os.path.isfile(state_file):
        return False

    LOG.info("Delaying nova-compute startup by %s seconds", delay)
    sleep(max(0, delay))
    try:
        with open(state_file, 'a'):
            pass
    except OSError as exc:
        # The delay is best effort: failing to record it must not prevent
        # nova-compute from starting.
        LOG.warning("Could not create state file %s: %s", state_file, exc)
    return True


def main(argv=None):
    """Apply the optional delay and replace this process with nova-compute.

    :param argv: argument vector including the program name; defaults to
        ``sys.argv``.
    :raises OSError: if the nova binary cannot be executed.
    """
    # Without explicit configuration the root logger drops INFO records, so
    # the messages below would never be shown.
    logging.basicConfig(level=logging.INFO)
    args, command = build_command(sys.argv if argv is None else argv)
    delay_once(args.delay, args.state_file)
    LOG.info("Executing %s", command)
    os.execv(args.nova_binary, command)


if __name__ == "__main__":
    main()