Add cluster health task with disk monitor
This change adds a monitor to corosync/pacemaker that migrates services away from a node if any of the monitored disks drops below 100M of free space. Once the operator has resolved the full disk, they must clear the alarm by running:

    crm node status-attr <hostname> delete "#health_disk"

After the alarm has been cleared, the services should be restarted automatically.

This change is not a replacement for proper monitoring, but it will cleanly shut down and migrate services if a controller runs out of disk space.

DocImpact
Closes-Bug: 1493520
Change-Id: I8a2cb4bd8d0b6070400d13e25d2310f4777b9faf
commit 03e7683381 (parent 2f1b1954de)
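A typical recovery on an affected controller might look like the sketch below; the hostname is a placeholder, and the exact output of the status-attr commands varies with the crmsh version:

    # after freeing space on the full partition, inspect the health attribute
    crm node status-attr node-1.test.domain.local show "#health_disk"
    # clear the alarm so pacemaker stops scoring the node as unhealthy
    crm node status-attr node-1.test.domain.local delete "#health_disk"
    # with node-health-strategy=migrate-on-red, services should now return
    crm_mon -1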
deployment/puppet/cluster/manifests/sysinfo.pp (new file, 70 lines)
@@ -0,0 +1,70 @@
# == Class: cluster::sysinfo
#
# Configure pacemaker sysinfo disk monitor
#
# === Parameters
#
# [*primary_controller*]
# (required) Boolean to indicate if this is the primary controller or not. The
# resources only get defined on the primary controller for the cluster, but the
# location is defined on any node in this cluster that this should run on.
#
# [*disks*]
# (optional) Array of mount points to monitor for free space. / is monitored
# by default, so it does not need to be specified.
# Defaults to []
#
# [*min_disk_free*]
# (optional) Minimum amount of free space required for the partitions
# Defaults to '100M'
#
# [*disk_unit*]
# (optional) Unit for disk space
# Defaults to 'M'
#
# [*monitor_interval*]
# (optional) Interval at which to monitor free space
# Defaults to '15s'
#
# [*monitor_ensure*]
# (optional) Ensure the corosync monitor is installed
# Defaults to present
#
class cluster::sysinfo (
  $primary_controller,
  $disks            = [],
  $min_disk_free    = '100M',
  $disk_unit        = 'M',
  $monitor_interval = '15s',
  $monitor_ensure   = present,
) {

  if $primary_controller {
    cs_resource { 'sysinfo':
      ensure          => $monitor_ensure,
      primitive_class => 'ocf',
      provided_by     => 'pacemaker',
      primitive_type  => 'SysInfo',
      complex_type    => 'clone',
      parameters      => {
        'disks'         => join(any2array($disks), ' '),
        'min_disk_free' => $min_disk_free,
        'disk_unit'     => $disk_unit,
      },
      operations      => { 'monitor' => { 'interval' => $monitor_interval } },
    }

    # Have services migrate if node health turns red from the failed disk check
    cs_property { 'node-health-strategy':
      ensure   => present,
      value    => 'migrate-on-red',
      provider => 'crm',
    }
  }

  # Pin the sysinfo clone to every node this class is evaluated on
  cs_location { "clone_sysinfo-on-${::fqdn}":
    primitive => 'clone_sysinfo',
    node_name => $::fqdn,
    score     => 'INFINITY',
  }
}
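Once this catalog has been applied on the primary controller, the resulting CIB objects can be checked from any cluster node. A minimal verification sketch, assuming crmsh is the configured cluster CLI (the resource and property names come from the manifest above):

    crm configure show sysinfo                       # the SysInfo primitive and its clone
    crm configure show | grep node-health-strategy   # expect migrate-on-red
    crm resource status clone_sysinfo                # one instance per controller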
deployment/puppet/osnailyfacter/lib/facter/mounts.rb (new file, 25 lines)
@@ -0,0 +1,25 @@
#
# This fact returns the currently mounted ext{2,3,4}, xfs or btrfs disks as a
# comma-separated list.
#

mounts = []
case Facter.value(:kernel)
when 'Linux'
  include_filesystems = ['ext[2-4]', 'xfs', 'btrfs']
  filesystems_re = Regexp.new(include_filesystems.join('|'))
  File.open('/proc/mounts').each do |line|
    mount = line.split(' ')[1] if filesystems_re.match(line)
    # If for some reason the mount line is not properly formatted, this
    # prevents nil from being added to the mounts. For example, a line that
    # only contains 'xfs' would return nil.
    mounts << mount unless mount.nil?
  end
end

Facter.add(:mounts) do
  setcode do
    mounts.join(',')
  end
end
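The fact can be exercised without a full Puppet run by pointing facter at the module's lib directory; a sketch, assuming it is run from the repository root (the output line is just an example host):

    FACTERLIB=deployment/puppet/osnailyfacter/lib/facter facter mounts
    /,/boot,/var/log,/var/lib/mysql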
deployment/puppet/osnailyfacter/modular/cluster/health.pp (new file, 22 lines)
@@ -0,0 +1,22 @@
notice('MODULAR: cluster/health.pp')

if !(hiera('role') in hiera('corosync_roles')) {
  fail('The node role is not in corosync roles')
}

# Load the mounted filesystems from our custom fact, remove /boot
$mount_points = delete(split($::mounts, ','), '/boot')

$primary_controller = hiera('primary_controller')
$disks              = hiera('corosync_disks', $mount_points)
$min_disk_free      = hiera('corosync_min_disk_space', '100M')
$disk_unit          = hiera('corosync_disk_unit', 'M')
$monitor_interval   = hiera('corosync_disk_monitor_interval', '30s')

class { 'cluster::sysinfo':
  primary_controller => $primary_controller,
  disks              => $disks,
  min_disk_free      => $min_disk_free,
  disk_unit          => $disk_unit,
  monitor_interval   => $monitor_interval,
}
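All five hiera keys are optional and fall back to the defaults shown above; an operator can confirm what a deployment will use with the hiera CLI. A sketch, assuming the hiera config lives at the usual /etc/puppet/hiera.yaml:

    hiera -c /etc/puppet/hiera.yaml corosync_min_disk_space
    hiera -c /etc/puppet/hiera.yaml -a corosync_disks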
deployment/puppet/osnailyfacter/modular/cluster/health_post.rb (new file, 13 lines)
@@ -0,0 +1,13 @@
require File.join File.dirname(__FILE__), '../test_common.rb'

class HealthPostTest < Test::Unit::TestCase
  def ubuntu?
    TestCommon::Facts.operatingsystem == 'Ubuntu'
  end

  def test_sysinfo_resource_started
    return unless ubuntu?
    assert TestCommon::Pacemaker.primitive_present?('sysinfo'), 'sysinfo is not created!'
  end
end
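The check can also be run by hand on a deployed controller, the same way the task runner invokes it in the task definition below (Test::Unit executes the test methods automatically at process exit):

    ruby /etc/puppet/modules/osnailyfacter/modular/cluster/health_post.rb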
@@ -9,3 +9,15 @@
    timeout: 3600
  test_post:
    cmd: ruby /etc/puppet/modules/osnailyfacter/modular/cluster/cluster_post.rb

- id: cluster_health
  type: puppet
  groups: [primary-controller, controller]
  required_for: [deploy_end]
  requires: [cluster]
  parameters:
    puppet_manifest: /etc/puppet/modules/osnailyfacter/modular/cluster/health.pp
    puppet_modules: /etc/puppet/modules
    timeout: 600
  test_post:
    cmd: ruby /etc/puppet/modules/osnailyfacter/modular/cluster/health_post.rb
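Because the task is part of the granular deployment graph, it can be re-run on selected nodes with the Fuel CLI; a sketch, where the node IDs are placeholders:

    fuel node --node 1,2,3 --tasks cluster_health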
tests/noop/spec/hosts/cluster/health_spec.rb (new file, 45 lines)
@@ -0,0 +1,45 @@
require 'spec_helper'
require 'shared-examples'
manifest = 'cluster/health.pp'

describe manifest do
  shared_examples 'catalog' do
    let(:primary_controller) do
      Noop.hiera 'primary_controller'
    end

    let(:facts) {
      Noop.ubuntu_facts.merge({
        :mounts => '/,/boot,/var/log,/var/lib/glance,/var/lib/mysql'
      })
    }

    # Hiera keys and defaults mirror those used in cluster/health.pp
    let(:disks) do
      Noop.hiera 'corosync_disks', ['/', '/var/log', '/var/lib/glance', '/var/lib/mysql']
    end

    let(:min_disk_free) do
      Noop.hiera 'corosync_min_disk_space', '100M'
    end

    let(:disk_unit) do
      Noop.hiera 'corosync_disk_unit', 'M'
    end

    let(:monitor_interval) do
      Noop.hiera 'corosync_disk_monitor_interval', '30s'
    end

    it {
      should contain_class('cluster::sysinfo').with(
        :primary_controller => primary_controller,
        :disks              => disks,
        :min_disk_free      => min_disk_free,
        :disk_unit          => disk_unit,
        :monitor_interval   => monitor_interval
      )
    }
  end
  test_ubuntu_and_centos manifest
end
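The spec can be run on its own from the noop test directory; a sketch, assuming the Gemfile dependencies have been installed with bundler:

    cd tests/noop
    bundle exec rspec spec/hosts/cluster/health_spec.rb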