Merge "enroll-init: reconfiguration flexibility"

This commit is contained in:
Zuul 2024-11-22 20:21:16 +00:00 committed by Gerrit Code Review
commit 7c511a7cd5
4 changed files with 176 additions and 113 deletions

View File

@ -0,0 +1,73 @@
---
#
# Copyright (c) 2024 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# This playbook executes the enroll-init stage of subcloud enrollment,
# using cloud-init on the target system and rvmc to insert a seed image.
# Overall, this triggers OAM network and password updates required for subsequent
# enrollment stages.
#
- name: Enroll Init Playbook
  hosts: all
  gather_facts: false
  become: false

  tasks:
    # Resolve the tunables used by the tasks below, applying defaults when the
    # caller did not supply them. job_retry_delay and operation_string are not
    # used directly in this play; they are set for the common/rvmc role.
    - set_fact:
        sysinv_port: "{{ sysinv_port | default(6385) }}"
        boot_wait_time: "{{ enroll_boot_wait_time | default(150) }}"
        wait_for_timeout: "{{ enroll_wait_for_timeout | default(900) }}"
        job_retry_delay: "{{ 120 | random }}"
        protocol: "{{ protocol | default('https') }}"
        operation_string: "enroll-init"

    - name: Run RVMC script to insert seed image and power on host
      include_role:
        name: common/rvmc

    - debug:
        msg: "Waiting for the system to enroll init..."

    # Wait for the sysinv API to open, ensuring that endpoints
    # are reconfigured with the new address.
    # The check runs on the Ansible controller (delegate_to: localhost) and
    # only starts polling after boot_wait_time seconds.
    - name: Waiting {{ wait_for_timeout }} seconds for port {{ sysinv_port }} become open on {{ enroll_reconfigured_oam }}
      wait_for:
        port: "{{ sysinv_port }}"
        host: "{{ enroll_reconfigured_oam }}"
        delay: "{{ boot_wait_time }}"
        timeout: "{{ wait_for_timeout }}"
        state: started
        msg: "Timeout waiting for {{ enroll_reconfigured_oam }}:{{ sysinv_port }}. Err_code=wait_enroll_init"
      delegate_to: localhost

    # The seed ISO must be ejected to prevent it from being reapplied on reboot.
    # This is necessary because cloud-config is set to run always and cloud-init services
    # remain enabled until enrollment is complete (allowing for the possibility
    # of re-running enroll-init if needed)
    - name: Run RVMC script to eject image
      include_role:
        name: common/rvmc
      vars:
        eject_image_only: true

    # During the OAM update, several async operations may trigger one another.
    # Although the endpoints are reconfigured by now, the API and certs may not
    # be fully updated. A simple curl request can be used to verify both, specifically
    # checking the region_id API, which will be needed shortly after this playbook completes.
    #
    # curl is deliberately run without -k: a certificate that does not validate
    # yields a non-200 result, so the task keeps retrying until both the API
    # and the certificate are usable (up to 30 attempts, 20 seconds apart).
    - name: Wait for the sysinv API to be ready and for certs to be updated for the reconfigured OAM endpoint
      shell: |
        curl -g -s -o /dev/null -w '%{http_code}' \
        "{{ protocol }}://{{ oam_url_host }}:{{ sysinv_port }}/v1/isystems/region_id"
      vars:
        # An IPv6 literal must be enclosed in brackets inside a URL, otherwise
        # its colons are ambiguous with the port separator. IPv4 addresses and
        # hostnames are used unchanged. -g (globoff) above stops curl from
        # treating the brackets as a URL glob range.
        oam_url_host: >-
          {{ '[' ~ enroll_reconfigured_oam ~ ']'
          if ':' in (enroll_reconfigured_oam | string)
          else enroll_reconfigured_oam }}
      register: api_response
      retries: 30
      delay: 20
      until: api_response.stdout == "200"
      delegate_to: localhost
      args:
        # Disable warning that suggests using the get_url and uri module:
        # - get_url is unnecessary as we're not actually downloading.
        # - uri module doesn't seem to work for our cert update check
        warn: false

View File

@ -16,104 +16,14 @@
boot_wait_time: "{{ boot_wait_time | default(600) }}"
wait_for_timeout: "{{ wait_for_timeout | default(3600) }}"
job_retry_delay: "{{ 120 | random }}"
operation_string: "install"
host_to_check: "{{ ansible_host }}"
enroll_init: "{{ enroll_reconfigured_oam is defined }}"
- name: Set facts for enroll_init
set_fact:
# ansible_port is waited on for an open connection,
# set it to sysinv_api_port to ensure that endpoints have
# reconfigured before marking the playbook as complete.
ansible_port: 6385
operation_string: "enroll-init"
host_to_check: "{{ enroll_reconfigured_oam }}"
boot_wait_time: "{{ enroll_boot_wait_time | default(150) }}"
wait_for_timeout: "{{ enroll_wait_for_timeout | default(900) }}"
when: enroll_init
# The following block is executed locally
- block:
- name: Run rvmc script
script: >
/usr/local/bin/rvmc_install.py
--debug={{ rvmc_debug_level | default(0) }}
--subcloud_name="{{ inventory_hostname }}"
--config_file="{{ rvmc_config_file }}"
# Script return code:
# 0 - Success
# 1 - Retryable failures
# 2 - Non-retryable failures
# (eg. Invalid credentials, Script execution timeout in 30 minutes,
# Failed to terminate the previous process.)
# Refer to rvmc_install.py in the distributedcloud repository for details.
until: script_result.rc != 1
register: script_result
retries: 1
delay: "{{ job_retry_delay }}"
failed_when: false
- name: Display script output
debug:
msg: "{{ script_result.stdout }}"
# Block to isolate failure message
- block:
- name: Set credential failure flag
set_fact:
failure_msg: >-
Credential failure.
Action: Check BMC username and password in config file
when: '"Action: Check BMC username and password in config file" in script_result.stdout'
- name: Set output msg if BMC is unreachable.
set_fact:
failure_msg: >-
Ping to BMC has failed.
Check BMC values in install values file.
Ensure you can ssh into BMC using these credentials.
Err_code=ping_bmc
when: '"Action: Check BMC ip address is pingable" in script_result.stdout'
- name: Set output message if session creation fails
set_fact:
failure_msg: >-
Failed to connect to BMC.
Check BMC credentials in install values file.
Ensure you can ssh into BMC using these credentials.
Err_code=bmc_cred
when: '"Failed to Create session" in script_result.stdout'
- name: Set output message if the script execution times out
set_fact:
failure_msg: >-
BMC operations timed out.
Please review the script output to identify the operation
that is currently stuck.
Err_code=rvmc_timeout
when: '"RVMC script execution timed out" in script_result.stdout'
- name: Set output message if failed to terminate the previous RVMC process
set_fact:
failure_msg: >-
Failed to terminate the previous RVMC process.
Please review the script output to find out the previous
RVMC process ID.
Err_code=rvmc_process
when: '"Failed to terminate the previous process" in script_result.stdout'
- name: Fail if Redfish Virtual Media Controller returns an error
fail:
msg: |
Failed to {{ operation_string }} the host via Redfish Virtual Media Controller.
{{ failure_msg | default('The RVMC script exited with failure response or exception.') }}
when: script_result.rc != 0
delegate_to: localhost
- name: Run RVMC script to insert boot image and power on host
include_role:
name: common/rvmc
- debug:
msg: "Waiting for the system to {{ operation_string }}..."
msg: "Waiting for the system to install..."
- name: Waiting {{ wait_for_timeout }} seconds for port {{ ansible_port }} become open on {{ host_to_check }}
local_action:
@ -123,24 +33,7 @@
delay={{ boot_wait_time }}
timeout={{ wait_for_timeout }}
state=started
msg="Timeout waiting for {{ host_to_check }}:{{ ansible_port }}. Err_code=wait_{{ operation_string }}"
# TODO (srana): Now that enroll-init tasks diverge from install, consider separating enroll-init
# from the install playbook. The RVMC script block should be made common so both install and
# enroll-init can use it independently.
- name: Wait for sysinv API to be ready on the reconfigured OAM endpoint
local_action:
module: uri
url: "https://{{ host_to_check }}:{{ ansible_port }}/v1/isystems/region_id"
method: GET
validate_certs: no
register: api_response
retries: 10
delay: 6
until: api_response.status == 200
# The caller (DCManager) will try and report an error in case of failure
failed_when: false
when: enroll_init
msg="Timeout waiting for {{ host_to_check }}:{{ ansible_port }}. Err_code=wait_install"
- name: Run validate host playbook post install
import_playbook: validate_host.yml
@ -152,4 +45,3 @@
sync_patch_metadata: true
sync_software_metadata: true
enforce_password_change: true
when: not enroll_init

View File

@ -0,0 +1,94 @@
---
#
# Copyright (c) 2024 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# This role handles the execution of the RVMC script, including error handling
# and reporting. It is primarily used for subcloud remote installation and
# subcloud enrollment, where an image is inserted, and the subcloud is rebooted.
#
# The following block is executed locally (delegate_to: localhost at the end
# of the block applies to every task inside it).
#
# Inputs read by this block:
#   rvmc_config_file   - path passed to the script as --config_file (required)
#   rvmc_debug_level   - passed as --debug, defaults to 0
#   eject_image_only   - when true, --eject_image_only is appended
#   job_retry_delay    - seconds to wait before the single retry
#   operation_string   - verb used in the failure message, defaults to 'install'
- block:
    - name: Run rvmc script
      script: >
        /usr/local/bin/rvmc_install.py
        --debug={{ rvmc_debug_level | default(0) }}
        --subcloud_name="{{ inventory_hostname }}"
        --config_file="{{ rvmc_config_file }}"
        {% if eject_image_only | default(false) | bool %}
        --eject_image_only
        {% endif %}
      # "| bool" above makes string values such as "false" (e.g. supplied as
      # an extra-var) evaluate as false instead of as a non-empty string.
      #
      # Script return code:
      #   0 - Success
      #   1 - Retryable failures
      #   2 - Non-retryable failures
      #       (eg. Invalid credentials, Script execution timeout in 30 minutes,
      #       Failed to terminate the previous process.)
      # Refer to rvmc_install.py in the distributedcloud repository for details.
      #
      # Only rc == 1 triggers the retry. failed_when: false keeps the play
      # going so the tasks below can translate the output into a clear message.
      until: script_result.rc != 1
      register: script_result
      retries: 1
      # Fall back to a random delay when the caller did not set one.
      delay: "{{ job_retry_delay | default(120 | random) }}"
      failed_when: false

    - name: Display script output
      debug:
        msg: "{{ script_result.stdout }}"

    # Block to isolate failure message
    # Each task matches a known substring of the script output and records a
    # matching failure_msg; if several match, the last one set wins. The
    # final task fails the host only when the script's return code is non-zero.
    - block:
        - name: Set credential failure flag
          set_fact:
            failure_msg: >-
              Credential failure.
              Action: Check BMC username and password in config file
          when: '"Action: Check BMC username and password in config file" in script_result.stdout'

        - name: Set output msg if BMC is unreachable.
          set_fact:
            failure_msg: >-
              Ping to BMC has failed.
              Check BMC values in install values file.
              Ensure you can ssh into BMC using these credentials.
              Err_code=ping_bmc
          when: '"Action: Check BMC ip address is pingable" in script_result.stdout'

        - name: Set output message if session creation fails
          set_fact:
            failure_msg: >-
              Failed to connect to BMC.
              Check BMC credentials in install values file.
              Ensure you can ssh into BMC using these credentials.
              Err_code=bmc_cred
          when: '"Failed to Create session" in script_result.stdout'

        - name: Set output message if the script execution times out
          set_fact:
            failure_msg: >-
              BMC operations timed out.
              Please review the script output to identify the operation
              that is currently stuck.
              Err_code=rvmc_timeout
          when: '"RVMC script execution timed out" in script_result.stdout'

        - name: Set output message if failed to terminate the previous RVMC process
          set_fact:
            failure_msg: >-
              Failed to terminate the previous RVMC process.
              Please review the script output to find out the previous
              RVMC process ID.
              Err_code=rvmc_process
          when: '"Failed to terminate the previous process" in script_result.stdout'

        - name: Fail if Redfish Virtual Media Controller returns an error
          fail:
            msg: |
              Failed to {{ operation_string | default('install') }} the host via Redfish Virtual Media Controller.
              {{ failure_msg | default('The RVMC script exited with failure response or exception.') }}
          when: script_result.rc != 0

  delegate_to: localhost

View File

@ -12,6 +12,10 @@
include_role:
name: common/check-services-status
# Runs the enroll-init-cleanup script with privilege escalation.
# NOTE(review): presumably this disables the cloud-init services left enabled
# during enroll-init - confirm against the script itself.
- name: Cleanup the cloud-init services
  command: /usr/local/bin/enroll-init-cleanup
  become: true
- name: Lock controller-0
include_role:
name: common/host-lock