Mtce: Make Multi-Node Failure Avoidance Configurable
The maintenance system implements a high availability (HA) feature designed to detect the simultaneous heartbeat failure of a group of hosts and avoid failing all those hosts until heartbeat resumes or after a set period of time. This feature is called Multi-Node Failure Avoidance, aka MNFA, and currently has the hosts threshold set to 3 and timeout set to 100 secs. This update implements enhancements to that existing feature by making the 'number-of-hosts threshold' and 'timeout period' customer configurable service parameters. The new service parameters are listed under platform:maintenance and are displayed with the following command > system service-parameter-list mnfa_threshold: This new label and value is added to the puppet managed /etc/mtc.ini and represents the number of hosts that are required to fail heartbeat as a group; within the heartbeat failure window (heartbeat_failure_threshold) after which maintenance activates MNFA Mode. This update changes the default number of failing hosts from 3 to 2 while allowing a configurable range from 2 to 100. mnfa_timeout: This new label and value is added to the puppet managed /etc/mtc.ini. While MNFA mode is active, it will remain active until the number of failing hosts drops below the mnfa_threshold or this timer expires. The MNFA mode deactivates on the first occurrence of either case. Upon deactivation the remaining failed hosts are no longer treated as a failure group but instead are all Gracefully Recovered individually. A value of zero imposes no timeout making the deactivation criteria solely host based. This update changes the default 100 second timer to 0; no-timeout while permitting a valid time range from 100 to 86400 secs or 1 day. DocImpact Story: 2003576 Task: 24903 Change-Id: I2fb737a4cd3c235845b064449949fcada303d6b2 Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
parent
4cbc961dc8
commit
f19dd0498f
@ -51,6 +51,8 @@ platform::mtce::agent::params::controller_boot_timeout: 1200
|
||||
platform::mtce::agent::params::heartbeat_period: 100
|
||||
platform::mtce::agent::params::heartbeat_failure_threshold: 10
|
||||
platform::mtce::agent::params::heartbeat_degrade_threshold: 6
|
||||
platform::mtce::agent::params::mnfa_threshold: 2
|
||||
platform::mtce::agent::params::mnfa_timeout: 0
|
||||
|
||||
# influxdb configuration for collectd
|
||||
platform::influxdb::params::bind_address: ':25826'
|
||||
|
@ -14,6 +14,8 @@ class platform::mtce::params (
|
||||
$heartbeat_failure_threshold = undef,
|
||||
$heartbeat_period = undef,
|
||||
$mtce_multicast = undef,
|
||||
$mnfa_threshold = undef,
|
||||
$mnfa_timeout = undef,
|
||||
) { }
|
||||
|
||||
|
||||
|
@ -1,3 +1,3 @@
|
||||
SRC_DIR="src"
|
||||
COPY_LIST="$SRC_DIR/LICENSE"
|
||||
TIS_PATCH_VER=6
|
||||
TIS_PATCH_VER=7
|
||||
|
@ -1,4 +1,4 @@
|
||||
; Packstack managed Maintenance Configuration
|
||||
; Puppet Managed Maintenance Configuration
|
||||
[agent] ; Agent Configuration
|
||||
keystone_auth_username = <%= @auth_username %> ; mtce auth username
|
||||
keystone_auth_pw = <%= @auth_pw %> ; mtce auth password
|
||||
@ -16,8 +16,31 @@ heartbeat_period = <%= @heartbeat_period %> ; Heartbeat period in milliseconds
|
||||
heartbeat_failure_threshold = <%= @heartbeat_failure_threshold %> ; Heartbeat failure threshold count.
|
||||
heartbeat_degrade_threshold = <%= @heartbeat_degrade_threshold %> ; Heartbeat degrade threshold count.
|
||||
|
||||
; Multi-Node Failure Avoidance (MNFA) Activation and Deactivation threshold.
|
||||
; The minimum number of hosts that fail heartbeat within the
|
||||
; heartbeat_failure_threshold upon which Maintenance activates MNFA Mode.
|
||||
; Once the number of failing hosts drops below this threshold then maintenance
|
||||
; deactivates MNFA mode while remaining failing hosts are Gracefully Recovered.
|
||||
; Default value is 2
|
||||
; Minimum value is 2
|
||||
; To modify execute:
|
||||
; system service-parameter-modify platform maintenance mnfa_threshold=<value>
|
||||
mnfa_threshold = <%= @mnfa_threshold %>
|
||||
|
||||
[timeouts]
|
||||
compute_boot_timeout = <%= @compute_boot_timeout %> ; The max time (seconds) that Mtce waits for the mtcAlive
|
||||
controller_boot_timeout = <%= @controller_boot_timeout %> ; message after which it will time out and fail the host.
|
||||
|
||||
|
||||
; Multi-Node Failure Avoidance (MNFA) Lifecycle Timer.
|
||||
; MNFA Activation starts a timer with this timeout value.
|
||||
; See mnfa_threshold above.
|
||||
; Maintenance automatically Deactivates MNFA mode if the number of hosts that
|
||||
; are failing heartbeat doesn't drop below mnfa_threshold before timer expires.
|
||||
; Timer is in seconds.
|
||||
; A zero value means infinite lifecycle or until the number of
|
||||
; heartbeat failing hosts drops below the mnfa_threshold before expiry.
|
||||
; Default value is 0
|
||||
; Minimum non-zero value is 100 ; maximum is 86400
|
||||
; To modify execute:
|
||||
; system service-parameter-modify platform maintenance mnfa_timeout=<value>
|
||||
mnfa_timeout = <%= @mnfa_timeout %>
|
||||
|
@ -944,12 +944,16 @@ SERVICE_PARAM_PLAT_MTCE_CONTROLLER_BOOT_TIMEOUT = 'controller_boot_timeout'
|
||||
SERVICE_PARAM_PLAT_MTCE_HBS_PERIOD = 'heartbeat_period'
|
||||
SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_THRESHOLD = 'heartbeat_failure_threshold'
|
||||
SERVICE_PARAM_PLAT_MTCE_HBS_DEGRADE_THRESHOLD = 'heartbeat_degrade_threshold'
|
||||
SERVICE_PARAM_PLAT_MTCE_MNFA_THRESHOLD = 'mnfa_threshold'
|
||||
SERVICE_PARAM_PLAT_MTCE_MNFA_TIMEOUT = 'mnfa_timeout'
|
||||
|
||||
SERVICE_PARAM_PLAT_MTCE_COMPUTE_BOOT_TIMEOUT_DEFAULT = 720
|
||||
SERVICE_PARAM_PLAT_MTCE_CONTROLLER_BOOT_TIMEOUT_DEFAULT = 1200
|
||||
SERVICE_PARAM_PLAT_MTCE_HBS_PERIOD_DEFAULT = 100
|
||||
SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_THRESHOLD_DEFAULT = 10
|
||||
SERVICE_PARAM_PLAT_MTCE_HBS_DEGRADE_THRESHOLD_DEFAULT = 6
|
||||
SERVICE_PARAM_PLAT_MTCE_MNFA_THRESHOLD_DEFAULT = 2
|
||||
SERVICE_PARAM_PLAT_MTCE_MNFA_TIMEOUT_DEFAULT = 0
|
||||
|
||||
# Nova Service Parameters
|
||||
SERVICE_PARAM_SECTION_NOVA_PCI_ALIAS = 'pci_alias'
|
||||
|
@ -93,6 +93,18 @@ def _validate_range(name, value, min, max):
|
||||
"Parameter '%s' must be an integer value." % name))
|
||||
|
||||
|
||||
def _validate_zero_or_range(name, value, min, max):
    """Validate that a parameter is either zero or an integer in [min, max].

    :param name: parameter name, used only in the error messages.
    :param value: candidate value; anything accepted by ``int()``.
    :param min: lowest accepted non-zero value (inclusive).
    :param max: highest accepted non-zero value (inclusive).
    :raises wsme.exc.ClientSideError: if ``value`` is not an integer, or is
        non-zero and falls outside ``min``..``max``.
    """
    # Convert once so the comparisons below all see the same parsed number.
    try:
        number = int(value)
    except ValueError:
        # Interpolate AFTER translation: formatting inside _() would hand
        # gettext an already-substituted string that matches no msgid.
        raise wsme.exc.ClientSideError(
            _("Parameter '%s' must be an integer value.") % name)

    # Zero is always accepted; any other value must sit inside the range.
    if number != 0 and (number < min or number > max):
        raise wsme.exc.ClientSideError(
            _("Parameter '%s' must be zero or between %d and %d.")
            % (name, min, max))
|
||||
|
||||
|
||||
def _validate_ldap_url(name, value):
|
||||
|
||||
url = urlparse.urlparse(value)
|
||||
@ -545,6 +557,19 @@ def _validate_hbs_degrade_threshold(name, value):
|
||||
SERVICE_PARAM_PLAT_MTCE_HBS_DEGRADE_THRESHOLD_MAX)
|
||||
|
||||
|
||||
def _validate_mnfa_threshold(name, value):
    """Check the MNFA threshold against its configured min/max bounds.

    Delegates to ``_validate_range`` using the
    SERVICE_PARAM_PLAT_MTCE_MNFA_THRESHOLD_MIN/_MAX limits.
    """
    lower_bound = SERVICE_PARAM_PLAT_MTCE_MNFA_THRESHOLD_MIN
    upper_bound = SERVICE_PARAM_PLAT_MTCE_MNFA_THRESHOLD_MAX
    _validate_range(name, value, lower_bound, upper_bound)
|
||||
|
||||
|
||||
def _validate_mnfa_timeout(name, value):
    """Check the MNFA timeout: zero, or within the configured min/max bounds.

    Delegates to ``_validate_zero_or_range`` using the
    SERVICE_PARAM_PLAT_MTCE_MNFA_TIMEOUT_MIN/_MAX limits.
    """
    # zero means no timeout; any other value must fall inside the
    # reasonable/tested window
    timeout_bounds = (SERVICE_PARAM_PLAT_MTCE_MNFA_TIMEOUT_MIN,
                      SERVICE_PARAM_PLAT_MTCE_MNFA_TIMEOUT_MAX)
    _validate_zero_or_range(name, value, *timeout_bounds)
|
||||
|
||||
|
||||
# Validate range of Performance Monitoring Event 'time to live' value
|
||||
def _validate_event_time_to_live_range(name, value):
|
||||
_validate_range(name, value,
|
||||
@ -1308,6 +1333,8 @@ PLATFORM_MTCE_PARAMETER_MANDATORY = [
|
||||
constants.SERVICE_PARAM_PLAT_MTCE_HBS_PERIOD,
|
||||
constants.SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_THRESHOLD,
|
||||
constants.SERVICE_PARAM_PLAT_MTCE_HBS_DEGRADE_THRESHOLD,
|
||||
constants.SERVICE_PARAM_PLAT_MTCE_MNFA_THRESHOLD,
|
||||
constants.SERVICE_PARAM_PLAT_MTCE_MNFA_TIMEOUT,
|
||||
]
|
||||
|
||||
PLATFORM_SYSINV_PARAMETER_PROTECTED = ['firewall_rules_id']
|
||||
@ -1322,6 +1349,10 @@ SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_THRESHOLD_MIN = 10
|
||||
SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_THRESHOLD_MAX = 100
|
||||
SERVICE_PARAM_PLAT_MTCE_HBS_DEGRADE_THRESHOLD_MIN = 4
|
||||
SERVICE_PARAM_PLAT_MTCE_HBS_DEGRADE_THRESHOLD_MAX = 100
|
||||
SERVICE_PARAM_PLAT_MTCE_MNFA_THRESHOLD_MIN = 2
|
||||
SERVICE_PARAM_PLAT_MTCE_MNFA_THRESHOLD_MAX = 100
|
||||
SERVICE_PARAM_PLAT_MTCE_MNFA_TIMEOUT_MIN = 100
|
||||
SERVICE_PARAM_PLAT_MTCE_MNFA_TIMEOUT_MAX = 86400
|
||||
|
||||
PLATFORM_MTCE_PARAMETER_VALIDATOR = {
|
||||
constants.SERVICE_PARAM_PLAT_MTCE_COMPUTE_BOOT_TIMEOUT:
|
||||
@ -1334,6 +1365,10 @@ PLATFORM_MTCE_PARAMETER_VALIDATOR = {
|
||||
_validate_hbs_failure_threshold,
|
||||
constants.SERVICE_PARAM_PLAT_MTCE_HBS_DEGRADE_THRESHOLD:
|
||||
_validate_hbs_degrade_threshold,
|
||||
constants.SERVICE_PARAM_PLAT_MTCE_MNFA_THRESHOLD:
|
||||
_validate_mnfa_threshold,
|
||||
constants.SERVICE_PARAM_PLAT_MTCE_MNFA_TIMEOUT:
|
||||
_validate_mnfa_timeout,
|
||||
}
|
||||
|
||||
PLATFORM_MTCE_PARAMETER_RESOURCE = {
|
||||
@ -1342,6 +1377,8 @@ PLATFORM_MTCE_PARAMETER_RESOURCE = {
|
||||
constants.SERVICE_PARAM_PLAT_MTCE_HBS_PERIOD: 'platform::mtce::params::heartbeat_period',
|
||||
constants.SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_THRESHOLD: 'platform::mtce::params::heartbeat_failure_threshold',
|
||||
constants.SERVICE_PARAM_PLAT_MTCE_HBS_DEGRADE_THRESHOLD: 'platform::mtce::params::heartbeat_degrade_threshold',
|
||||
constants.SERVICE_PARAM_PLAT_MTCE_MNFA_THRESHOLD: 'platform::mtce::params::mnfa_threshold',
|
||||
constants.SERVICE_PARAM_PLAT_MTCE_MNFA_TIMEOUT: 'platform::mtce::params::mnfa_timeout',
|
||||
}
|
||||
|
||||
# Panko Event TTL range from 1 hour to 1 year
|
||||
|
@ -486,6 +486,16 @@ class ConductorManager(service.PeriodicService):
|
||||
'name': constants.SERVICE_PARAM_PLAT_MTCE_HBS_DEGRADE_THRESHOLD,
|
||||
'value': constants.SERVICE_PARAM_PLAT_MTCE_HBS_DEGRADE_THRESHOLD_DEFAULT,
|
||||
},
|
||||
{'service': constants.SERVICE_TYPE_PLATFORM,
|
||||
'section': constants.SERVICE_PARAM_SECTION_PLATFORM_MAINTENANCE,
|
||||
'name': constants.SERVICE_PARAM_PLAT_MTCE_MNFA_THRESHOLD,
|
||||
'value': constants.SERVICE_PARAM_PLAT_MTCE_MNFA_THRESHOLD_DEFAULT,
|
||||
},
|
||||
{'service': constants.SERVICE_TYPE_PLATFORM,
|
||||
'section': constants.SERVICE_PARAM_SECTION_PLATFORM_MAINTENANCE,
|
||||
'name': constants.SERVICE_PARAM_PLAT_MTCE_MNFA_TIMEOUT,
|
||||
'value': constants.SERVICE_PARAM_PLAT_MTCE_MNFA_TIMEOUT_DEFAULT,
|
||||
},
|
||||
{'service': constants.SERVICE_TYPE_PANKO,
|
||||
'section': constants.SERVICE_PARAM_SECTION_PANKO_DATABASE,
|
||||
'name': constants.SERVICE_PARAM_NAME_PANKO_DATABASE_EVENT_TIME_TO_LIVE,
|
||||
|
Loading…
Reference in New Issue
Block a user