Add cluster health task with disk monitor

This change adds a disk monitor to corosync/pacemaker that migrates
services off a node if any monitored disk drops below 100M of free space.

Once the operator has resolved the full disk, they must clear the
alarm by running:

 crm node status-attr <hostname> delete "#health_disk"

After the alarm has been cleared, the services should be automatically
restarted.
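
If it is unclear which node raised the alarm, the node attributes (the
SysInfo agent stores the state in "#health_disk") can be listed first,
for example with pacemaker's crm_mon; this command is shown only for
convenience and is not part of this change:

 crm_mon -1 -A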

This change is not a replacement for proper monitoring, but it will
cleanly shut down and migrate services if a controller runs out of disk
space.

DocImpact
Closes-Bug: 1493520

Change-Id: I8a2cb4bd8d0b6070400d13e25d2310f4777b9faf
Alex Schultz 2015-09-21 16:29:56 -05:00
parent 2f1b1954de
commit 03e7683381
6 changed files with 187 additions and 0 deletions


@@ -0,0 +1,70 @@
# == Class: cluster::sysinfo
#
# Configure pacemaker sysinfo disk monitor
#
# === Parameters
#
# [*primary_controller*]
# (required) Boolean to indicate if this is the primary controller. The
# resources are only defined on the primary controller for the cluster, but the
# location constraint is defined on every node in the cluster that applies this
# class.
#
# [*disks*]
# (optional) Array of mount points to monitor for free space. '/' is monitored
# by default, so it does not need to be specified.
# Defaults to []
#
# [*min_disk_free*]
# (optional) Minimum amount of free space required for the partitions
# Defaults to '100M'
#
# [*disk_unit*]
# (optional) Unit for disk space
# Defaults to 'M'
#
# [*monitor_interval*]
# (optional) Interval at which to monitor free space
# Defaults to '15s'
#
# [*monitor_ensure*]
# (optional) Ensure the corosync monitor is installed
# Defaults to present
#
class cluster::sysinfo (
  $primary_controller,
  $disks            = [],
  $min_disk_free    = '100M',
  $disk_unit        = 'M',
  $monitor_interval = '15s',
  $monitor_ensure   = present,
) {

  if $primary_controller {
    cs_resource { 'sysinfo':
      ensure          => $monitor_ensure,
      primitive_class => 'ocf',
      provided_by     => 'pacemaker',
      primitive_type  => 'SysInfo',
      complex_type    => 'clone',
      parameters      => {
        'disks'         => join(any2array($disks), ' '),
        'min_disk_free' => $min_disk_free,
        'disk_unit'     => $disk_unit,
      },
      operations      => { 'monitor' => { 'interval' => $monitor_interval } },
    }

    # Have services migrate if node health turns red from the failed disk check
    cs_property { 'node-health-strategy':
      ensure   => present,
      value    => 'migrate-on-red',
      provider => 'crm',
    }
  }

  # Pin the sysinfo clone to this node so the monitor runs on every node
  # that applies this class
  cs_location { "clone_sysinfo-on-${::fqdn}":
    primitive => 'clone_sysinfo',
    node_name => $::fqdn,
    score     => 'INFINITY',
  }
}
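
For reference, a minimal sketch of declaring this class directly; the values
below are illustrative only, and in this change the class is actually wired up
by the health.pp task further down:

 # Hypothetical standalone declaration; values are examples, not the defaults
 class { 'cluster::sysinfo':
   primary_controller => true,
   disks              => ['/var/log', '/var/lib/mysql'],
   min_disk_free      => '200M',
   disk_unit          => 'M',
   monitor_interval   => '60s',
 }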


@@ -0,0 +1,25 @@
#
# This fact returns the currently mounted ext{2,3,4}, xfs or btrfs disks as a
# comma-separated list.
#
mounts = []

case Facter.value(:kernel)
when 'Linux'
  include_filesystems = ['ext[2-4]', 'xfs', 'btrfs']
  filesystems_re = Regexp.new(include_filesystems.join('|'))
  File.open('/proc/mounts').each do |line|
    mount = line.split(' ')[1] if filesystems_re.match(line)
    # If for some reason the mount line is not properly formatted, this
    # prevents nil from being added to the mounts. For example, a line that
    # only contains 'xfs' would yield nil here.
    mounts << mount unless mount.nil?
  end
end

Facter.add(:mounts) do
  setcode do
    mounts.join(",")
  end
end
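
Once the module is deployed, the fact can be checked on a node with facter;
the output below is hypothetical and depends on the mounts actually present:

 facter -p mounts
 /,/boot,/var/log,/var/lib/mysql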


@@ -0,0 +1,22 @@
notice('MODULAR: cluster/health.pp')
if !(hiera('role') in hiera('corosync_roles')) {
  fail('The node role is not in corosync roles')
}

# Load the mounted filesystems from our custom fact, remove /boot
$mount_points = delete(split($::mounts, ','), '/boot')

$primary_controller = hiera('primary_controller')
$disks              = hiera('corosync_disks', $mount_points)
$min_disk_free      = hiera('corosync_min_disk_space', '100M')
$disk_unit          = hiera('corosync_disk_unit', 'M')
$monitor_interval   = hiera('corosync_disk_monitor_interval', '30s')

class { 'cluster::sysinfo':
  primary_controller => $primary_controller,
  disks              => $disks,
  min_disk_free      => $min_disk_free,
  disk_unit          => $disk_unit,
  monitor_interval   => $monitor_interval,
}
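
All of these hiera keys are optional. As a sketch, they could be overridden in
a hiera data file like this (key names are taken from the manifest above,
values are illustrative):

 corosync_disks:
   - '/'
   - '/var/log'
 corosync_min_disk_space: '512M'
 corosync_disk_unit: 'M'
 corosync_disk_monitor_interval: '60s'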


@@ -0,0 +1,13 @@
require File.join File.dirname(__FILE__), '../test_common.rb'
class HealthPostTest < Test::Unit::TestCase

  def ubuntu?
    TestCommon::Facts.operatingsystem == 'Ubuntu'
  end

  def test_sysinfo_resource_started
    return unless ubuntu?
    assert TestCommon::Pacemaker.primitive_present?('sysinfo'), 'sysinfo is not created!'
  end

end


@@ -9,3 +9,15 @@
    timeout: 3600
  test_post:
    cmd: ruby /etc/puppet/modules/osnailyfacter/modular/cluster/cluster_post.rb

- id: cluster_health
  type: puppet
  groups: [primary-controller, controller]
  required_for: [deploy_end]
  requires: [cluster]
  parameters:
    puppet_manifest: /etc/puppet/modules/osnailyfacter/modular/cluster/health.pp
    puppet_modules: /etc/puppet/modules
    timeout: 600
  test_post:
    cmd: ruby /etc/puppet/modules/osnailyfacter/modular/cluster/health_post.rb


@@ -0,0 +1,45 @@
require 'spec_helper'
require 'shared-examples'
manifest = 'cluster/health.pp'
describe manifest do
  shared_examples 'catalog' do
    let(:primary_controller) do
      Noop.hiera 'primary_controller'
    end

    let(:facts) {
      Noop.ubuntu_facts.merge({
        :mounts => '/,/boot,/var/log,/var/lib/glance,/var/lib/mysql'
      })
    }

    let(:disks) do
      Noop.hiera 'corosync_disks', ['/', '/var/log', '/var/lib/glance', '/var/lib/mysql']
    end

    let(:min_disk_free) do
      Noop.hiera 'corosync_min_disk_space', '100M'
    end

    let(:disk_unit) do
      Noop.hiera 'corosync_disk_unit', 'M'
    end

    let(:monitor_interval) do
      Noop.hiera 'corosync_disk_monitor_interval', '30s'
    end

    it {
      should contain_class('cluster::sysinfo').with(
        :primary_controller => primary_controller,
        :disks              => disks,
        :min_disk_free      => min_disk_free,
        :disk_unit          => disk_unit,
        :monitor_interval   => monitor_interval
      )
    }
  end
  test_ubuntu_and_centos manifest
end