diff --git a/releasenotes/notes/derive-deployment-parameters-c5e97d3df9bfc114.yaml b/releasenotes/notes/derive-deployment-parameters-c5e97d3df9bfc114.yaml new file mode 100644 index 000000000..50bbcb561 --- /dev/null +++ b/releasenotes/notes/derive-deployment-parameters-c5e97d3df9bfc114.yaml @@ -0,0 +1,13 @@ +--- +features: + - | + Add a Mistral workflow that uses hardware introspection data to derive + deployment parameters for features such as DPDK and HCI (hyperconverged + Nova compute and Ceph OSD nodes). The derived parameters workflow is + automatically invoked during deployment when the workflow is listed in + the plan environment file. + + For each role in the deployment, the workflow analyzes the Heat resource + tree to determine which features are relevant to that role. The main + workflow invokes secondary workflows responsible for deriving parameters + associated with each feature. diff --git a/workbooks/derive_params.yaml b/workbooks/derive_params.yaml index cf815f47f..735d3f80e 100644 --- a/workbooks/derive_params.yaml +++ b/workbooks/derive_params.yaml @@ -199,9 +199,12 @@ workflows: action: baremetal_introspection.get_data uuid=<% $.profile_node_uuid %> publish: hw_data: <% task().result %> + # Establish an empty dictionary of derived_parameters prior to + # invoking the individual "feature" algorithms + derived_parameters: <% dict() %> on-success: - - get_dpdk_derive_params: <% $.role_features.contains("DPDK") %> - # TODO: Needs to include condition to call other service derive params if DPDK is not available. 
+ - get_dpdk_derive_params: <% $.role_features.contains('DPDK') %> + - get_hci_derive_params: <% not $.role_features.contains('DPDK') and $.role_features.contains('HCI') %> on-error: set_status_failed_get_introspection_data get_dpdk_derive_params: @@ -225,8 +228,23 @@ workflows: derived_parameters: <% $.derived_parameters %> publish: derived_parameters: <% task().result.get('derived_parameters', {}) %> + on-success: + - get_hci_derive_params: <% $.role_features.contains('HCI') %> on-error: set_status_failed_get_host_derive_params - # Workflow ends here because there are no more algorithms. + + get_hci_derive_params: + workflow: tripleo.derive_params_formulas.v1.hci_derive_params + input: + role_name: <% $.role_name %> + environment_parameters: <% $.environment_parameters %> + heat_resource_tree: <% $.heat_resource_tree %> + introspection_data: <% $.hw_data %> + user_inputs: <% $.user_inputs %> + derived_parameters: <% $.derived_parameters %> + publish: + derived_parameters: <% task().result.get('derived_parameters', {}) %> + on-error: set_status_failed_get_hci_derive_params + # Done (no more derived parameter features) set_status_failed_get_role_info: publish: @@ -284,6 +302,13 @@ workflows: message: <% task(get_host_derive_params).result %> on-success: fail + set_status_failed_get_hci_derive_params: + publish: + role_name: <% $.role_name %> + status: FAILED + message: <% task(get_hci_derive_params).result %> + on-success: fail + _get_role_info: description: > diff --git a/workbooks/derive_params_formulas.yaml b/workbooks/derive_params_formulas.yaml index c42a28ac4..acf272665 100644 --- a/workbooks/derive_params_formulas.yaml +++ b/workbooks/derive_params_formulas.yaml @@ -39,7 +39,7 @@ workflows: dpdk_nics_numa_info: <% task().result %> on-success: # TODO: Need to remove condtions here - # adding condition and trhow error in action for empty check + # adding condition and throw error in action for empty check - get_dpdk_nics_numa_nodes: <% $.dpdk_nics_numa_info %> 
- set_status_failed_get_dpdk_nics_numa_info: <% not $.dpdk_nics_numa_info %> on-error: set_status_failed_on_error_get_dpdk_nics_numa_info @@ -340,3 +340,215 @@ workflows: status: FAILED message: 'Unable to determine huge pages' on-success: fail + + + hci_derive_params: + description: Derive the deployment parameters for HCI + input: + - role_name + - environment_parameters + - heat_resource_tree + - introspection_data + - user_inputs + - derived_parameters: {} + + output: + derived_parameters: <% $.derived_parameters.mergeWith($.get('hci_parameters', {})) %> + + tasks: + get_hci_inputs: + publish: + hci_profile: <% $.user_inputs.get('hci_profile', '') %> + hci_profile_config: <% $.user_inputs.get('hci_profile_config', {}) %> + MB_PER_GB: 1024 + on-success: + - get_average_guest_memory_size_in_mb: <% $.hci_profile and $.hci_profile_config.get($.hci_profile, {}) %> + - set_failed_invalid_hci_profile: <% $.hci_profile and not $.hci_profile_config.get($.hci_profile, {}) %> + # When no hci_profile is specified, the workflow terminates without deriving any HCI parameters. 
+ + get_average_guest_memory_size_in_mb: + publish: + average_guest_memory_size_in_mb: <% $.hci_profile_config.get($.hci_profile, {}).get('average_guest_memory_size_in_mb', 0) %> + on-success: + - get_average_guest_cpu_utilization_percentage: <% isInteger($.average_guest_memory_size_in_mb) %> + - set_failed_invalid_average_guest_memory_size_in_mb: <% not isInteger($.average_guest_memory_size_in_mb) %> + + get_average_guest_cpu_utilization_percentage: + publish: + average_guest_cpu_utilization_percentage: <% $.hci_profile_config.get($.hci_profile, {}).get('average_guest_cpu_utilization_percentage', 0) %> + on-success: + - get_gb_overhead_per_guest: <% isInteger($.average_guest_cpu_utilization_percentage) %> + - set_failed_invalid_average_guest_cpu_utilization_percentage: <% not isInteger($.average_guest_cpu_utilization_percentage) %> + + get_gb_overhead_per_guest: + publish: + gb_overhead_per_guest: <% $.user_inputs.get('gb_overhead_per_guest', 0.5) %> + on-success: + - get_gb_per_osd: <% isNumber($.gb_overhead_per_guest) %> + - set_failed_invalid_gb_overhead_per_guest: <% not isNumber($.gb_overhead_per_guest) %> + + get_gb_per_osd: + publish: + gb_per_osd: <% $.user_inputs.get('gb_per_osd', 3) %> + on-success: + - get_cores_per_osd: <% isNumber($.gb_per_osd) %> + - set_failed_invalid_gb_per_osd: <% not isNumber($.gb_per_osd) %> + + get_cores_per_osd: + publish: + cores_per_osd: <% $.user_inputs.get('cores_per_osd', 1.0) %> + on-success: + - get_extra_configs: <% isNumber($.cores_per_osd) %> + - set_failed_invalid_cores_per_osd: <% not isNumber($.cores_per_osd) %> + + get_extra_configs: + publish: + extra_config: <% $.environment_parameters.get('ExtraConfig', {}) %> + role_extra_config: <% $.environment_parameters.get(concat($.role_name, 'ExtraConfig'), {}) %> + on-success: get_num_osds + + get_num_osds: + publish: + num_osds: <% $.role_extra_config.get('ceph::profile::params::osds', $.extra_config.get('ceph::profile::params::osds', {})).keys().count() %> 
on-success: + - get_memory_mb: <% $.num_osds %> + - set_failed_no_osds: <% not $.num_osds %> + + get_memory_mb: + publish: + memory_mb: <% $.introspection_data.get('memory_mb', 0) %> + on-success: + - get_num_cores: <% $.memory_mb %> + - set_failed_get_memory_mb: <% not $.memory_mb %> + + get_num_cores: + publish: + # TODO(abishop): If NovaVcpuPinSet is defined then use it to determine num_cores + num_cores: <% $.introspection_data.get('cpus', 0) %> + on-success: + - calculate_nova_parameters: <% $.num_cores %> + - set_failed_get_num_cores: <% not $.num_cores %> + + # HCI calculations are broken into multiple steps. This is necessary + # because variables published by a Mistral task are not available + # for use by that same task. Variables computed and published in a task + # are only available in subsequent tasks. + # + # The HCI calculations compute two Nova parameters: + # - reserved_host_memory + # - cpu_allocation_ratio + # + # The reserved_host_memory calculation computes the amount of memory + # that needs to be reserved for Ceph and the total amount of "guest + # overhead" memory that is based on the anticipated number of guests. + # Pseudo-code for the calculation (disregarding MB and GB units) is + # as follows: + # + # ceph_memory = mem_per_osd * num_osds + # nova_memory = total_memory - ceph_memory + # num_guests = nova_memory / + # (average_guest_memory_size + overhead_per_guest) + # reserved_memory = ceph_memory + (num_guests * overhead_per_guest) + # + # The cpu_allocation_ratio calculation is similar in that it takes into + # account the number of cores that must be reserved for Ceph. 
+ # + # ceph_cores = cores_per_osd * num_osds + # guest_cores = num_cores - ceph_cores + # guest_vcpus = guest_cores / average_guest_utilization + # cpu_allocation_ratio = guest_vcpus / num_cores + + calculate_nova_parameters: + publish: + avg_guest_util: <% $.average_guest_cpu_utilization_percentage / 100.0 %> + avg_guest_size_gb: <% $.average_guest_memory_size_in_mb / float($.MB_PER_GB) %> + memory_gb: <% $.memory_mb / float($.MB_PER_GB) %> + ceph_mem_gb: <% $.gb_per_osd * $.num_osds %> + nonceph_cores: <% $.num_cores - int($.cores_per_osd * $.num_osds) %> + on-success: calc_step_2 + + calc_step_2: + publish: + num_guests: <% int(($.memory_gb - $.ceph_mem_gb) / ($.avg_guest_size_gb + $.gb_overhead_per_guest)) %> + guest_vcpus: <% $.nonceph_cores / $.avg_guest_util %> + on-success: calc_step_3 + + calc_step_3: + publish: + reserved_host_memory: <% $.MB_PER_GB * int($.ceph_mem_gb + ($.num_guests * $.gb_overhead_per_guest)) %> + cpu_allocation_ratio: <% $.guest_vcpus / $.num_cores %> + on-success: validate_results + + validate_results: + publish: + # Verify whether HCI is viable: + # - No more than 80% of the memory may be reserved for Ceph and guest overhead + # - The computed cpu_allocation_ratio must be at least 0.5 + mem_ok: <% $.reserved_host_memory <= ($.memory_mb * 0.8) %> + cpu_ok: <% $.cpu_allocation_ratio >= 0.5 %> + on-success: + - set_failed_insufficient_mem: <% not $.mem_ok %> + - set_failed_insufficient_cpu: <% not $.cpu_ok %> + - publish_hci_parameters: <% $.mem_ok and $.cpu_ok %> + + publish_hci_parameters: + publish: + # TODO(abishop): Update this when the cpu_allocation_ratio can be set + # via a THT parameter (no such parameter currently exists). Until a + # THT parameter exists, use hiera data to set the cpu_allocation_ratio. 
+ hci_parameters: <% dict(concat($.role_name, 'Parameters') => dict('NovaReservedHostMemory' => $.reserved_host_memory)) + dict(concat($.role_name, 'ExtraConfig') => dict('nova::cpu_allocation_ratio' => $.cpu_allocation_ratio)) %> + + set_failed_invalid_hci_profile: + publish: + message: "'<% $.hci_profile %>' is not a valid HCI profile." + on-success: fail + + set_failed_invalid_average_guest_memory_size_in_mb: + publish: + message: "'<% $.average_guest_memory_size_in_mb %>' is not a valid average_guest_memory_size_in_mb value." + on-success: fail + + set_failed_invalid_gb_overhead_per_guest: + publish: + message: "'<% $.gb_overhead_per_guest %>' is not a valid gb_overhead_per_guest value." + on-success: fail + + set_failed_invalid_gb_per_osd: + publish: + message: "'<% $.gb_per_osd %>' is not a valid gb_per_osd value." + on-success: fail + + set_failed_invalid_cores_per_osd: + publish: + message: "'<% $.cores_per_osd %>' is not a valid cores_per_osd value." + on-success: fail + + set_failed_invalid_average_guest_cpu_utilization_percentage: + publish: + message: "'<% $.average_guest_cpu_utilization_percentage %>' is not a valid average_guest_cpu_utilization_percentage value." + on-success: fail + + set_failed_no_osds: + publish: + message: "No Ceph OSDs found in the overcloud definition ('ceph::profile::params::osds')." + on-success: fail + + set_failed_get_memory_mb: + publish: + message: "Unable to determine the amount of physical memory (no 'memory_mb' found in introspection_data)." + on-success: fail + + set_failed_get_num_cores: + publish: + message: "Unable to determine the number of CPU cores (no 'cpus' found in introspection_data)." + on-success: fail + + set_failed_insufficient_mem: + publish: + message: "<% $.memory_mb %> MB is not enough memory to run hyperconverged." + on-success: fail + + set_failed_insufficient_cpu: + publish: + message: "<% $.num_cores %> CPU cores are not enough to run hyperconverged." + on-success: fail