Browse Source

Document the cluster policies

This change also adds the ability to insert SVG drawings in the RST
documentation.

Change-Id: I45127fad6832c81208135af2246dbbaab9257180
(cherry picked from commit 619e700a24)
tags/0.8rc1^0
Simon Pasquier 3 years ago
parent
commit
c064c17c71
4 changed files with 1226 additions and 5 deletions
  1. 1
    0
      doc/.gitignore
  2. 16
    2
      doc/Makefile
  3. 1056
    0
      doc/images/AFD_and_GSE_message_flow.svg
  4. 153
    3
      doc/source/user/alarms.rst

+ 1
- 0
doc/.gitignore View File

@@ -1 +1,2 @@
1 1
 build/
2
+images/*.pdf

+ 16
- 2
doc/Makefile View File

@@ -18,6 +18,12 @@ PAPEROPT_letter = -D latex_paper_size=letter
18 18
 ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
19 19
 # the i18n builder cannot share the environment and doctrees with the others
20 20
 I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
21
+# SVG to PDF conversion
22
+SVG2PDF       = inkscape
23
+SVG2PDF_FLAGS =
24
+# Build a list of SVG files to convert to PDF
25
+PDF_FILES := $(foreach dir, images, $(patsubst %.svg,%.pdf,$(wildcard $(dir)/*.svg)))
26
+
21 27
 
22 28
 .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
23 29
 
@@ -48,6 +54,7 @@ help:
48 54
 
49 55
 clean:
50 56
 	rm -rf $(BUILDDIR)/*
57
+	rm -f $(PDF_FILES)
51 58
 
52 59
 html:
53 60
 	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
@@ -103,14 +110,14 @@ epub:
103 110
 	@echo
104 111
 	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
105 112
 
106
-latex:
113
+latex: $(PDF_FILES)
107 114
 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
108 115
 	@echo
109 116
 	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
110 117
 	@echo "Run \`make' in that directory to run these through (pdf)latex" \
111 118
 	      "(use \`make latexpdf' here to do that automatically)."
112 119
 
113
-latexpdf:
120
+latexpdf: $(PDF_FILES)
114 121
 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
115 122
 	@echo "Running LaTeX files through pdflatex..."
116 123
 	$(MAKE) -C $(BUILDDIR)/latex all-pdf
@@ -175,3 +182,10 @@ pseudoxml:
175 182
 	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
176 183
 	@echo
177 184
 	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
185
+
186
+# Rule for building the PDF files only
187
+images: $(PDF_FILES)
188
+
189
+# Pattern rule for converting SVG to PDF
190
+%.pdf : %.svg
191
+	$(SVG2PDF) -f $< -A $@

+ 1056
- 0
doc/images/AFD_and_GSE_message_flow.svg
File diff suppressed because it is too large
View File


+ 153
- 3
doc/source/user/alarms.rst View File

@@ -34,6 +34,14 @@ and the *GSE plugins* for Global Status Evaluation plugins.
34 34
 Both the AFD and GSE plugins in turn create metrics called the *AFD metrics*
35 35
 and the *GSE metrics* respectively.
36 36
 
37
+
38
+.. figure:: ../../images/AFD_and_GSE_message_flow.*
39
+   :width: 800
40
+   :alt: Message flow for the AFD and GSE metrics
41
+   :align: center
42
+
43
+   Message flow for the AFD and GSE metrics
44
+
37 45
 The *AFD metrics* contain information about the health status of a
38 46
 resource like a device, a system component like a filesystem, or service
39 47
 like an API endpoint, at the node level.
@@ -49,7 +57,29 @@ The health status of a cluster is inferred by the GSE plugins using
49 57
 aggregation and correlation rules and facts contained in the
50 58
 *AFD metrics* it receives from the Collectors.
51 59
 
52
-The *AFD and GSE metrics* are consumed by other groups
60
+In the current version of the LMA Toolchain, three GSE plugins are configured:
61
+
62
+* The Service Cluster GSE which receives metrics from the AFD plugins monitoring the services and emits health status for the clusters of services (nova-api, nova-scheduler and so on).
63
+* The Node Cluster GSE which receives metrics from the AFD plugins monitoring the system and emits health status for the clusters of nodes (controllers, computes and so on).
64
+* The Global Cluster GSE which receives metrics from the two other GSE plugins and emits health status for the top-level clusters (Nova, MySQL and so on).
65
+
66
+The meaning associated with a health status is the following:
67
+
68
+* **Down**: One or several primary functions of a cluster have failed. For example,
69
+  the API service for Nova or Cinder isn't accessible.
70
+* **Critical**: One or several primary functions of a
71
+  cluster are severely degraded. The quality
72
+  of service delivered to the end-user should be severely
73
+  impacted.
74
+* **Warning**: One or several primary functions of the
75
+  cluster are slightly degraded. The quality
76
+  of service delivered to the end-user should be slightly
77
+  impacted.
78
+* **Unknown**: There is not enough data to infer the actual
79
+  health state of the cluster.
80
+* **Okay**: None of the above was found to be true.
81
+
82
+The *AFD and GSE metrics* are also consumed by other groups
53 83
 of Heka plugins we call the *Persisters*.
54 84
 
55 85
 * There is a *Persister* for InfluxDB which turns the *GSE metric*
@@ -162,7 +192,7 @@ Where:
162 192
     system mount point. If value is specified as an empty string (""), then the rule
163 193
     is applied to all the aggregated values for the specified field name like for example
164 194
     the file system mount point.
165
-    If value is specified as the ‘*’ wildcard character,
195
+    If value is specified as the '*' wildcard character,
166 196
     then the rule is applied to each of the metrics matching the metric name and field name.
167 197
     For example, the alarm definition sample given above would run the rule
168 198
     for each of the file system mount points associated with the *fs_space_percent_free* metric.
@@ -177,7 +207,7 @@ Where:
177 207
 |   not implemented yet)
178 208
 
179 209
 | function
180
-|   Type: enum(‘last’ | ‘min’ | ‘max’ | ‘sum’ | ‘count’ | ‘avg’ | ‘median’ | ‘mode’ | ‘roc’ | ‘mww’ | ‘mww_nonparametric’)
210
+|   Type: enum('last' | 'min' | 'max' | 'sum' | 'count' | 'avg' | 'median' | 'mode' | 'roc' | 'mww' | 'mww_nonparametric')
181 211
 |   Where:
182 212
 |     last:
183 213
 |       returns the last value of all the values
@@ -331,3 +361,123 @@ need to re-apply the Puppet module::
331 361
     /etc/fuel/plugins/lma_collector-0.8/puppet/manifests/configure_afd_filters.pp
332 362
 
333 363
 This will restart the LMA Collector with your change.
364
+
365
+Cluster policies
366
+----------------
367
+
368
+GSE plugins are driven by policies that describe how plugins determine the
369
+cluster's health status.
370
+
371
+By default, two policies are defined:
372
+
373
+* *highest_severity*: the cluster's status depends on the
374
+  member with the highest severity, typically used for a cluster of services.
375
+* *majority_of_members*: the cluster is healthy as long as
376
+  (N+1)/2 members of the cluster are healthy. This is typically used for
377
+  clusters managed by Pacemaker.
378
+
379
+The GSE policies are defined declaratively in the */etc/hiera/override/gse_filters.yaml*
380
+file at the *gse_policies* entry.
381
+
382
+A policy consists of a list of rules which are evaluated against the
383
+current status of the cluster's members. When one of the rules matches, the
384
+cluster's status gets the value associated with the rule and the evaluation
385
+stops here. The last rule of the list is usually a catch-all rule that
386
+defines the default status in case none of the previous rules could be matched.
387
+
388
+A policy rule is defined as shown in the example below::
389
+
390
+   # The following rule definition reads as: "the cluster's status is critical if more than 50% of its members are either down or critical"
391
+   - status: critical
392
+     trigger:
393
+       logical_operator: or
394
+       rules:
395
+         - function: percent
396
+           arguments: [ down, critical ]
397
+           relational_operator: '>'
398
+           threshold: 50
399
+
400
+Where
401
+
402
+| status:
403
+|   Type: Enum(down, critical, warning, okay, unknown)
404
+|   The cluster's status if the condition is met
405
+
406
+| logical_operator
407
+|    Type: Enum('and' | '&&' | 'or' | '||')
408
+|    The conjunction relation for the condition rules
409
+
410
+| rules
411
+|    Type: list
412
+|    List of condition rules to execute
413
+
414
+| function
415
+|   Type: enum('count' | 'percent')
416
+|   Where:
417
+|     count:
418
+|       returns the *number of members* that match the passed value(s).
419
+|     percent:
420
+|       returns the *percentage of members* that match the passed value(s).
421
+
422
+| arguments:
423
+|    Type: list of status values
424
+|    List of status values passed to the function
425
+
426
+| relational_operator:
427
+|    Type: Enum('lt' | '<' | 'gt' | '>' | 'lte' | '<=' | 'gte' | '>=')
428
+|    The comparison against the threshold
429
+
430
+| threshold
431
+|   Type: float
432
+|   The threshold value
433
+
434
+Let's now take a more detailed look at the policy called *highest_severity*::
435
+
436
+  gse_policies:
437
+
438
+    highest_severity:
439
+      - status: down
440
+        trigger:
441
+          logical_operator: or
442
+          rules:
443
+            - function: count
444
+              arguments: [ down ]
445
+              relational_operator: '>'
446
+              threshold: 0
447
+      - status: critical
448
+        trigger:
449
+          logical_operator: or
450
+          rules:
451
+            - function: count
452
+              arguments: [ critical ]
453
+              relational_operator: '>'
454
+              threshold: 0
455
+      - status: warning
456
+        trigger:
457
+          logical_operator: or
458
+          rules:
459
+            - function: count
460
+              arguments: [ warning ]
461
+              relational_operator: '>'
462
+              threshold: 0
463
+      - status: okay
464
+        trigger:
465
+          logical_operator: or
466
+          rules:
467
+            - function: count
468
+              arguments: [ okay ]
469
+              relational_operator: '>'
470
+              threshold: 0
471
+      - status: unknown
472
+
473
+The policy definition reads as:
474
+
475
+* The status of the cluster is *Down* if the status of at least one cluster's member is *Down*.
476
+
477
+* Otherwise the status of the cluster is *Critical* if the status of at least one cluster's member is *Critical*.
478
+
479
+* Otherwise the status of the cluster is *Warning* if the status of at least one cluster's member is *Warning*.
480
+
481
+* Otherwise the status of the cluster is *Okay* if the status of at least one cluster's member is *Okay*.
482
+
483
+* Otherwise the status of the cluster is *Unknown*.

Loading…
Cancel
Save