Puppet module to manage log processor

classify-log.crm 5.2KB
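The script below reads a console log on standard input, trains the per-class data files named by its arguments, and prints one signed pR score per classified line (FAILURE-leaning lines get a negative sign). As a rough illustration only, here is a minimal Python sketch of how a caller might drive it; it assumes the CRM114 binary is on PATH as "crm" and that the two positional arguments land in :_arg2: and :_arg3: as the script's own comments describe. The paths and helper name are hypothetical, not part of this module.

import subprocess

def classify_log(log_path, data_dir, target):
    # Minimal sketch (not the project's actual driver): run classify-log.crm
    # over one log file and return the signed pR scores it prints, roughly
    # one per classified line. Negative scores lean toward FAILURE.
    # Assumption: "crm" is the CRM114 engine and the argument order shown
    # here maps to :_arg2:/:_arg3:. target should be "SUCCESS" or "FAILURE".
    with open(log_path, "rb") as log:
        proc = subprocess.run(
            ["crm", "classify-log.crm", data_dir, target],
            stdin=log,
            stdout=subprocess.PIPE,
            check=True,
        )
    return [float(score) for score in proc.stdout.decode().split()]

# Hypothetical usage:
# scores = classify_log("console.log", "/tmp/crm114/console_html", "FAILURE")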

#! /usr/bin/crm
#
# Copyright 2013 OpenStack Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
# This script trains an OSB (Orthogonal Sparse Bigram) Bayesian filter
# with log lines from test runs and classifies each line according to
# the likelihood that it indicates an error. Very little experimentation
# has been done to determine the best classifier and training method;
# further experimentation may be useful.

# The training method is TET -- Train Every Thing. This is not
# normally advised as a training method for Bayesian filters. In
# experiments, it identified about twice as many lines as being
# associated with errors as were indicated by a TOE (Train On Error)
# method. Some of them were false positives, but many were not, and
# of those, it had a much higher confidence (pR ~= 37) in them than
# TOE did. TET seems to give qualitatively better results when
# filtering for higher pR values.
# Set unbuffered IO
window

# Base component of path to data files
isolate (:prefix:) /:*:_arg2:/

# Whether this run is for a SUCCESS or FAILURE result
isolate (:target:) /:*:_arg3:/

# Train each file on a newline just to make sure it exists
learn [:_nl:] <osb unique microgroom> (:*:prefix:/SUCCESS.css)
learn [:_nl:] <osb unique microgroom> (:*:prefix:/FAILURE.css)

{
    # Iterate over each line
    window <bychar> /\n/ /\n/
    {
        isolate (:stats:)
        isolate (:result:)
        isolate (:prob:)
        isolate (:pr:)
        # Save a copy of this line
        isolate (:line:) /:*:_dw:/
        {
            {
                # Remove things that look like timestamps from the beginning of the line
                match (:timestamp:) /^[-.0-9 |:]+/
                alter (:timestamp:) //
            }
            {
                # Don't treat UUIDs as uniquely special.
                match (:uuidtoken:) /[[:xdigit:]]{8}-[[:xdigit:]]{4}-[[:xdigit:]]{4}-[[:xdigit:]]{4}-[[:xdigit:]]{12}/
                alter (:uuidtoken:) /UUIDTOKEN/
                {
                    match (:uuidtoken:) <fromnext> /[[:xdigit:]]{8}-[[:xdigit:]]{4}-[[:xdigit:]]{4}-[[:xdigit:]]{4}-[[:xdigit:]]{12}/
                    alter (:uuidtoken:) /UUIDTOKEN/
                    # Loop to replace all TOKENS in line
                    liaf
                }
            }
            {
                # Don't treat IDs as uniquely special.
                match (:idtoken:) /[[:xdigit:]]{32,40}/
                alter (:idtoken:) /IDTOKEN/
                {
                    match (:idtoken:) <fromnext> /[[:xdigit:]]{32,40}/
                    alter (:idtoken:) /IDTOKEN/
                    # Loop to replace all TOKENS in line
                    liaf
                }
            }
            {
                # Don't treat long numbers as uniquely special.
                match (:numtoken:) /-[[:digit:]]{7,}/
                alter (:numtoken:) /-NUMTOKEN/
                {
                    match (:numtoken:) <fromnext> /-[[:digit:]]{7,}/
                    alter (:numtoken:) /-NUMTOKEN/
                    # Loop to replace all TOKENS in line
                    liaf
                }
            }
            # Train on the line
            learn <osb unique microgroom> (:*:prefix:/:*:target:.css)
            # Classify the line to see if it looks more like a SUCCESS or FAILURE line
            classify <osb unique microgroom> (:*:prefix:/SUCCESS.css :*:prefix:/FAILURE.css) (:stats:)
            {
                # The stats variable looks like:
                # CLASSIFY succeeds; success probability: 1.0000 pR: 304.6527
                # Best match to file #0 (/tmp/crm114/console_html/SUCCESS.css) prob: 0.9933 pR: 2.1720
                # Total features in input file: 20
                # #0 (/tmp/crm114/console_html/SUCCESS.css): features: 3544235, hits: 901854, prob: 9.93e-01, pR: 2.17
                # #1 (/tmp/crm114/console_html/FAILURE.css): features: 1, hits: 0, prob: 6.69e-03, pR: -2.17
                # Pull out the filename, probability, and pR (a kind of logarithmic probability, see CRM docs)
                match [:stats:] <nomultiline> /^Best match to .*\/([A-Za-z]+).css\) prob: ([-.0-9]+) pR: ([-.0-9]+)/ ( :: :result: :prob: :pr: )
                {
                    # If this line is classified as FAILURE, negate
                    # the pR value (which will always be positive).
                    # Do this by prepending a '-' or the empty string.
                    {
                        match [:result:] /FAILURE/
                        alter (:result:) /-/
                    } alius {
                        alter (:result:) //
                    }
                }
                # Output the sign and pR value for this line.
                output /:*:result::*:pr:\n/
            }
        }
    }
    liaf
}
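Because the script emits nothing but a signed pR value per line, a downstream consumer has to pair those scores back up with the original log to find the interesting lines. The sketch below, reusing the hypothetical classify_log() helper above, keeps only the lines the filter is most confident are failures; it assumes the script produced exactly one score per input line, and the -10.0 threshold is an arbitrary illustration, not a value taken from this module.

def failure_lines(log_lines, scores, threshold=-10.0):
    # FAILURE-classified lines carry a negated pR, so a very negative score
    # means the filter is confident the line indicates an error. Filtering
    # on the magnitude of pR is the "filtering for higher pR values" idea
    # from the header comment; the threshold here is purely illustrative.
    for line, pr in zip(log_lines, scores):
        if pr <= threshold:
            yield pr, line

# Hypothetical usage, continuing from classify_log() above:
# scores = classify_log("console.log", "/tmp/crm114/console_html", "FAILURE")
# with open("console.log") as f:
#     for pr, line in failure_lines(f.read().splitlines(), scores):
#         print("%9.3f  %s" % (pr, line))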