From 03e7683381d14c4a9d5da93481b2d5140e7896f0 Mon Sep 17 00:00:00 2001 From: Alex Schultz Date: Mon, 21 Sep 2015 16:29:56 -0500 Subject: [PATCH] Add cluster health task with disk monitor This change adds a monitor into corosync/pacemaker to migrate services if the monitored disks drop below 100M free. Once the operator has resolved the full disk, they must clear the alarm by running: crm node status-attr delete "#health_disk" After the alarm has been cleared, the services should be automatically restarted. This change is not a replacement for proper monitoring, but it will properly shut down and migrate services if a controller runs out of disk space. DocImpact Closes-Bug: 1493520 Change-Id: I8a2cb4bd8d0b6070400d13e25d2310f4777b9faf --- .../puppet/cluster/manifests/sysinfo.pp | 70 +++++++++++++++++++ .../puppet/osnailyfacter/lib/facter/mounts.rb | 25 +++++++ .../osnailyfacter/modular/cluster/health.pp | 22 ++++++ .../modular/cluster/health_post.rb | 13 ++++ .../osnailyfacter/modular/cluster/tasks.yaml | 12 ++++ tests/noop/spec/hosts/cluster/health_spec.rb | 45 ++++++++++++ 6 files changed, 187 insertions(+) create mode 100644 deployment/puppet/cluster/manifests/sysinfo.pp create mode 100644 deployment/puppet/osnailyfacter/lib/facter/mounts.rb create mode 100644 deployment/puppet/osnailyfacter/modular/cluster/health.pp create mode 100644 deployment/puppet/osnailyfacter/modular/cluster/health_post.rb create mode 100644 tests/noop/spec/hosts/cluster/health_spec.rb diff --git a/deployment/puppet/cluster/manifests/sysinfo.pp b/deployment/puppet/cluster/manifests/sysinfo.pp new file mode 100644 index 0000000000..0511acfef5 --- /dev/null +++ b/deployment/puppet/cluster/manifests/sysinfo.pp @@ -0,0 +1,70 @@ +# == Class: cluster::sysinfo +# +# Configure pacemaker sysinfo disk monitor +# +# === Parameters +# +# [*primary_controller*] +# (required) Boolean to indicate if this is the primary controller or not. 
The +# resources only get defined on the primary controller for the cluster but the +# location is defined on any node on this cluster that this should run on. +# +# [*disks*] +# (optional) array of mount points to monitor for free space. / is monitored +# by default and does not need to be specified. +# Defaults to [] +# +# [*min_disk_free*] +# (optional) Minimum amount of free space required for the partitions +# Defaults to '100M' +# +# [*disk_unit*] +# (optional) Unit for disk space +# Defaults to 'M' +# +# [*monitor_interval*] +# (optional) Interval to monitor free space +# Defaults to '15s' +# +# [*monitor_ensure*] +# (optional) Ensure the corosync monitor is installed +# Defaults to present +# +class cluster::sysinfo ( + $primary_controller, + $disks = [], + $min_disk_free = '100M', + $disk_unit = 'M', + $monitor_interval = '15s', + $monitor_ensure = present, +) { + + if $primary_controller { + cs_resource { 'sysinfo': + ensure => $monitor_ensure, + primitive_class => 'ocf', + provided_by => 'pacemaker', + primitive_type => 'SysInfo', + complex_type => 'clone', + parameters => { + 'disks' => join(any2array($disks), ' '), + 'min_disk_free' => $min_disk_free, + 'disk_unit' => $disk_unit, + }, + operations => { 'monitor' => { 'interval' => $monitor_interval } }, + } + + # Have service migrate if health turns red from the failed disk check + cs_property { 'node-health-strategy': + ensure => present, + value => 'migrate-on-red', + provider => 'crm', + } + } + + cs_location { "clone_sysinfo-on-${::fqdn}": + primitive => 'clone_sysinfo', + node_name => $::fqdn, + score => 'INFINITY', + } +} diff --git a/deployment/puppet/osnailyfacter/lib/facter/mounts.rb b/deployment/puppet/osnailyfacter/lib/facter/mounts.rb new file mode 100644 index 0000000000..6ebc8d395f --- /dev/null +++ b/deployment/puppet/osnailyfacter/lib/facter/mounts.rb @@ -0,0 +1,25 @@ +# +# This fact returns the currently mounted ext{2,3,4}, xfs or btrfs disks as a +# comma separated list. 
+# + +mounts = [] +case Facter.value(:kernel) + when 'Linux' + include_filesystems = ['ext[2-4]', 'xfs', 'btrfs'] + filesystems_re = /\A(?:#{include_filesystems.join('|')})\z/ + File.foreach('/proc/mounts') do |line| + _device, mount, fstype = line.split(' ') + # only the filesystem type column is tested, so a device or mount point + # whose name merely contains e.g. 'xfs' is not reported; a malformed line + # leaves fstype nil and is skipped + mounts << mount if fstype && filesystems_re.match(fstype) + end +end + +Facter.add(:mounts) do + setcode do + mounts.join(",") + end +end + diff --git a/deployment/puppet/osnailyfacter/modular/cluster/health.pp b/deployment/puppet/osnailyfacter/modular/cluster/health.pp new file mode 100644 index 0000000000..8d58d2c99e --- /dev/null +++ b/deployment/puppet/osnailyfacter/modular/cluster/health.pp @@ -0,0 +1,22 @@ +notice('MODULAR: cluster/health.pp') + +if !(hiera('role') in hiera('corosync_roles')) { + fail('The node role is not in corosync roles') +} + +# load the mounted filesystems from our custom fact, remove boot +$mount_points = delete(split($::mounts, ','), '/boot') + +$primary_controller = hiera('primary_controller') +$disks = hiera('corosync_disks', $mount_points) +$min_disk_free = hiera('corosync_min_disk_space', '100M') +$disk_unit = hiera('corosync_disk_unit', 'M') +$monitor_interval = hiera('corosync_disk_monitor_interval', '30s') + +class { 'cluster::sysinfo': + primary_controller => $primary_controller, + disks => $disks, + min_disk_free => $min_disk_free, + disk_unit => $disk_unit, + monitor_interval => $monitor_interval, +} diff --git a/deployment/puppet/osnailyfacter/modular/cluster/health_post.rb b/deployment/puppet/osnailyfacter/modular/cluster/health_post.rb new file mode 100644 index 0000000000..8ec4e0105c --- /dev/null +++ b/deployment/puppet/osnailyfacter/modular/cluster/health_post.rb @@ -0,0 +1,13 @@ +require File.join File.dirname(__FILE__), '../test_common.rb' + +class HealthPostTest < Test::Unit::TestCase + def 
ubuntu? + TestCommon::Facts.operatingsystem == 'Ubuntu' + end + + def test_sysinfo_resource_started + return unless ubuntu? + assert TestCommon::Pacemaker.primitive_present?('sysinfo'), 'sysinfo is not created!' + end +end + diff --git a/deployment/puppet/osnailyfacter/modular/cluster/tasks.yaml b/deployment/puppet/osnailyfacter/modular/cluster/tasks.yaml index a0fd019782..a7841d2ee7 100644 --- a/deployment/puppet/osnailyfacter/modular/cluster/tasks.yaml +++ b/deployment/puppet/osnailyfacter/modular/cluster/tasks.yaml @@ -9,3 +9,15 @@ timeout: 3600 test_post: cmd: ruby /etc/puppet/modules/osnailyfacter/modular/cluster/cluster_post.rb + +- id: cluster_health + type: puppet + groups: [primary-controller, controller] + required_for: [deploy_end] + requires: [cluster] + parameters: + puppet_manifest: /etc/puppet/modules/osnailyfacter/modular/cluster/health.pp + puppet_modules: /etc/puppet/modules + timeout: 600 + test_post: + cmd: ruby /etc/puppet/modules/osnailyfacter/modular/cluster/health_post.rb diff --git a/tests/noop/spec/hosts/cluster/health_spec.rb b/tests/noop/spec/hosts/cluster/health_spec.rb new file mode 100644 index 0000000000..2846b574f9 --- /dev/null +++ b/tests/noop/spec/hosts/cluster/health_spec.rb @@ -0,0 +1,45 @@ +require 'spec_helper' +require 'shared-examples' +manifest = 'cluster/health.pp' + +describe manifest do + shared_examples 'catalog' do + let(:primary_controller) do + Noop.hiera 'primary_controller' + end + let(:facts) { + Noop.ubuntu_facts.merge({ + :mounts => '/,/boot,/var/log,/var/lib/glance,/var/lib/mysql' + }) + } + + let(:disks) do + Noop.hiera 'corosync_disks', ['/', '/var/log', '/var/lib/glance', '/var/lib/mysql'] + end + + let(:min_disk_free) do + Noop.hiera 'corosync_min_disk_space', '100M' + end + + let(:disk_unit) do + Noop.hiera 'corosync_disk_unit', 'M' + end + + let(:monitor_interval) do + Noop.hiera 'corosync_disk_monitor_interval', '30s' + end + + it { + should contain_class('cluster::sysinfo').with( + 
:primary_controller => primary_controller, + :disks => disks, + :min_disk_free => min_disk_free, + :disk_unit => disk_unit, + :monitor_interval => monitor_interval + ) + } + + end + test_ubuntu_and_centos manifest +end +