Add cluster health task with disk monitor

This change adds a disk monitor to corosync/pacemaker that migrates
services off a node if any monitored disk drops below 100M of free space.

Once the operator has resolved the full disk, they must clear the
alarm by running:

 crm node status-attr <hostname> delete "#health_disk"

After the alarm has been cleared, the services should be automatically
restarted.
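
If it is unclear which node raised the alarm, the node attributes (the
SysInfo agent stores the state in "#health_disk") can be listed first,
for example with pacemaker's crm_mon; this command is shown only for
convenience and is not part of this change:

 crm_mon -1 -A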

This change is not a replacement for proper monitoring, but it will
cleanly shut down and migrate services if a controller runs out of disk
space.

DocImpact
Closes-Bug: 1493520

Change-Id: I8a2cb4bd8d0b6070400d13e25d2310f4777b9faf
Alex Schultz 2015-09-21 16:29:56 -05:00
parent 2f1b1954de
commit 03e7683381
6 changed files with 187 additions and 0 deletions


@@ -0,0 +1,70 @@
# == Class: cluster::sysinfo
#
# Configure pacemaker sysinfo disk monitor
#
# === Parameters
#
# [*primary_controller*]
# (required) Boolean to indicate if this is the primary controller. The
# resources are only defined on the primary controller for the cluster, but the
# location constraint is defined on every node in the cluster that applies this
# class.
#
# [*disks*]
# (optional) Array of mount points to monitor for free space. '/' is monitored
# by default, so it does not need to be specified.
# Defaults to []
#
# [*min_disk_free*]
# (optional) Minimum amount of free space required for the partitions
# Defaults to '100M'
#
# [*disk_unit*]
# (optional) Unit for disk space
# Defaults to 'M'
#
# [*monitor_interval*]
# (optional) Interval at which to monitor free space
# Defaults to '15s'
#
# [*monitor_ensure*]
# (optional) Ensure the corosync monitor is installed
# Defaults to present
#
class cluster::sysinfo (
  $primary_controller,
  $disks            = [],
  $min_disk_free    = '100M',
  $disk_unit        = 'M',
  $monitor_interval = '15s',
  $monitor_ensure   = present,
) {

  if $primary_controller {
    cs_resource { 'sysinfo':
      ensure          => $monitor_ensure,
      primitive_class => 'ocf',
      provided_by     => 'pacemaker',
      primitive_type  => 'SysInfo',
      complex_type    => 'clone',
      parameters      => {
        'disks'         => join(any2array($disks), ' '),
        'min_disk_free' => $min_disk_free,
        'disk_unit'     => $disk_unit,
      },
      operations      => { 'monitor' => { 'interval' => $monitor_interval } },
    }

    # Have services migrate if node health turns red from the failed disk check
    cs_property { 'node-health-strategy':
      ensure   => present,
      value    => 'migrate-on-red',
      provider => 'crm',
    }
  }

  # Pin the sysinfo clone to this node so the monitor runs on every node
  # that applies this class
  cs_location { "clone_sysinfo-on-${::fqdn}":
    primitive => 'clone_sysinfo',
    node_name => $::fqdn,
    score     => 'INFINITY',
  }
}
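
For reference, a minimal sketch of declaring this class directly; the values
below are illustrative only, and in this change the class is actually wired up
by the health.pp task further down:

 # Hypothetical standalone declaration; values are examples, not the defaults
 class { 'cluster::sysinfo':
   primary_controller => true,
   disks              => ['/var/log', '/var/lib/mysql'],
   min_disk_free      => '200M',
   disk_unit          => 'M',
   monitor_interval   => '60s',
 }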


@@ -0,0 +1,25 @@
#
# This fact returns the currently mounted ext{2,3,4}, xfs or btrfs disks as a
# comma-separated list.
#
mounts = []

case Facter.value(:kernel)
when 'Linux'
  include_filesystems = ['ext[2-4]', 'xfs', 'btrfs']
  filesystems_re = Regexp.new(include_filesystems.join('|'))
  File.open('/proc/mounts').each do |line|
    mount = line.split(' ')[1] if filesystems_re.match(line)
    # If for some reason the mount line is not properly formatted, this
    # prevents nil from being added to the mounts. For example, a line that
    # only contains 'xfs' would yield nil here.
    mounts << mount unless mount.nil?
  end
end

Facter.add(:mounts) do
  setcode do
    mounts.join(",")
  end
end
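
Once the module is deployed, the fact can be checked on a node with facter;
the output below is hypothetical and depends on the mounts actually present:

 facter -p mounts
 /,/boot,/var/log,/var/lib/mysql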


@@ -0,0 +1,22 @@
notice('MODULAR: cluster/health.pp')
if !(hiera('role') in hiera('corosync_roles')) {
  fail('The node role is not in corosync roles')
}

# Load the mounted filesystems from our custom fact, remove /boot
$mount_points = delete(split($::mounts, ','), '/boot')

$primary_controller = hiera('primary_controller')
$disks              = hiera('corosync_disks', $mount_points)
$min_disk_free      = hiera('corosync_min_disk_space', '100M')
$disk_unit          = hiera('corosync_disk_unit', 'M')
$monitor_interval   = hiera('corosync_disk_monitor_interval', '30s')

class { 'cluster::sysinfo':
  primary_controller => $primary_controller,
  disks              => $disks,
  min_disk_free      => $min_disk_free,
  disk_unit          => $disk_unit,
  monitor_interval   => $monitor_interval,
}
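
All of these hiera keys are optional. As a sketch, they could be overridden in
a hiera data file like this (key names are taken from the manifest above,
values are illustrative):

 corosync_disks:
   - '/'
   - '/var/log'
 corosync_min_disk_space: '512M'
 corosync_disk_unit: 'M'
 corosync_disk_monitor_interval: '60s'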


@@ -0,0 +1,13 @@
require File.join File.dirname(__FILE__), '../test_common.rb'
class HealthPostTest < Test::Unit::TestCase

  def ubuntu?
    TestCommon::Facts.operatingsystem == 'Ubuntu'
  end

  def test_sysinfo_resource_started
    return unless ubuntu?
    assert TestCommon::Pacemaker.primitive_present?('sysinfo'), 'sysinfo is not created!'
  end

end


@@ -9,3 +9,15 @@
    timeout: 3600
  test_post:
    cmd: ruby /etc/puppet/modules/osnailyfacter/modular/cluster/cluster_post.rb

- id: cluster_health
  type: puppet
  groups: [primary-controller, controller]
  required_for: [deploy_end]
  requires: [cluster]
  parameters:
    puppet_manifest: /etc/puppet/modules/osnailyfacter/modular/cluster/health.pp
    puppet_modules: /etc/puppet/modules
    timeout: 600
  test_post:
    cmd: ruby /etc/puppet/modules/osnailyfacter/modular/cluster/health_post.rb


@@ -0,0 +1,45 @@
require 'spec_helper'
require 'shared-examples'
manifest = 'cluster/health.pp'
describe manifest do
  shared_examples 'catalog' do
    let(:primary_controller) do
      Noop.hiera 'primary_controller'
    end

    let(:facts) {
      Noop.ubuntu_facts.merge({
        :mounts => '/,/boot,/var/log,/var/lib/glance,/var/lib/mysql'
      })
    }

    let(:disks) do
      Noop.hiera 'corosync_disks', ['/', '/var/log', '/var/lib/glance', '/var/lib/mysql']
    end

    let(:min_disk_free) do
      Noop.hiera 'corosync_min_disk_space', '100M'
    end

    let(:disk_unit) do
      Noop.hiera 'corosync_disk_unit', 'M'
    end

    let(:monitor_interval) do
      Noop.hiera 'corosync_disk_monitor_interval', '30s'
    end

    it {
      should contain_class('cluster::sysinfo').with(
        :primary_controller => primary_controller,
        :disks              => disks,
        :min_disk_free      => min_disk_free,
        :disk_unit          => disk_unit,
        :monitor_interval   => monitor_interval
      )
    }
  end
  test_ubuntu_and_centos manifest
end