move queries.yaml into a queries subdir

This handles the piece of work we've been talking about for a while:
moving the queries.yaml file into a directory with a bunch of
files. These remain yaml so that they can be tagged with additional
metadata. This would support the concept of soft deleting, as well
as other useful metadata to gauge the evolution of the bugs we
track over time.

This should see some real review, as it's an extensive enough
change that the existing tests might not be sufficient. However, it
should be enough to move this forward quite a bit.

This also makes forward-looking statements about doing soft deletes
with a resolved_at keyword. That implementation will
come later.

Change-Id: I86317fcf6f1886ab5b6c0ee154b29e71865c52b7
This commit is contained in:
Sean Dague 2013-11-29 22:07:08 -05:00
parent 142d520dba
commit 932986a876
48 changed files with 200 additions and 201 deletions

View File

@ -14,16 +14,25 @@ When a tempest job failure is detected, by monitoring gerrit (using gerritlib),
Eventually this can be tied into the rechecker tool and launchpad
queries.yaml
queries/
------------
All queries are stored in a yaml file called: queries.yaml
All queries are stored in separate yaml files in a queries directory
at the top of the elastic_recheck code base. The files are named
######.yaml (where ###### is the bug number), and each yaml file should
have a ``query`` keyword whose value is the query text for ElasticSearch.
Guidelines for queries.yaml
Guidelines for good queries
- After a bug is resolved and has no more hits in elasticsearch, it should be removed
- After a bug is resolved and has no more hits in elasticsearch, we
should flag it with a resolved_at keyword. This will let us keep
some memory of past bugs, and see if they come back. (Note: this is
a forward-looking statement; sorting out resolved_at will come in
the future.)
- Queries should get as close as possible to fingerprinting the root cause
- Queries should not return any hits for successful jobs, this is a sign the query isn't specific enough
- Queries should not return any hits for successful jobs, this is a
sign the query isn't specific enough
Future Work
------------
@ -34,6 +43,7 @@ Future Work
- Split out queries repo
- Expand gating testing
- Cleanup and document code better
- Sort out resolved_at stamping to remove active bugs
- Move away from polling ElasticSearch to discover if it's ready or not
- Add nightly job to propose a patch to remove bug queries that return no hits -- Bug hasn't been seen in 2 weeks and must be closed

View File

@ -9,5 +9,5 @@ channel_config=/home/mtreinish/elasticRecheck/recheckwatchbot.yaml
[gerrit]
user=treinish
host=review.openstack.org
query_file=/home/mtreinish/elasticRecheck/queries.yaml
query_file=/home/mtreinish/elasticRecheck/queries
key=/home/mtreinish/.ssh/id_rsa

View File

@ -25,9 +25,10 @@ LPCACHEDIR = os.path.expanduser('~/.launchpadlib/cache')
def get_options():
parser = argparse.ArgumentParser(description='Edit hiera yaml.')
parser.add_argument('--file', '-f', help="Queries file",
default="queries.yaml")
parser = argparse.ArgumentParser(
description='Query for existing recheck bugs.')
parser.add_argument('--dir', '-d', help="Queries Directory",
default="queries")
return parser.parse_args()
@ -87,7 +88,7 @@ def get_launchpad_bug(bug):
def main():
opts = get_options()
classifier = er.Classifier(opts.file)
classifier = er.Classifier(opts.dir)
data = collect_metrics(classifier)
print_metrics(data)

View File

@ -24,8 +24,8 @@ import logging
import os
import sys
import time
import yaml
import elastic_recheck.loader as loader
import elastic_recheck.query_builder as qb
from elastic_recheck import results
@ -120,10 +120,10 @@ class Classifier():
queries = None
def __init__(self, queries):
def __init__(self, queries_dir):
self.es = results.SearchEngine(self.ES_URL)
self.queries = yaml.load(open(queries).read())
self.queries_filename = queries
self.queries_dir = queries_dir
self.queries = loader.load(self.queries_dir)
def hits_by_query(self, query, facet=None, size=100):
es_query = qb.generic(query, facet=facet)
@ -133,7 +133,7 @@ class Classifier():
"""Returns either empty list or list with matched bugs."""
self.log.debug("Entering classify")
#Reload each time
self.queries = yaml.load(open(self.queries_filename).read())
self.queries = loader.load(self.queries_dir)
#Wait till Elastic search is ready
self.log.debug("checking if ElasticSearch is ready")
if not self._is_ready(change_number, patch_number, comment):

34
elastic_recheck/loader.py Normal file
View File

@ -0,0 +1,34 @@
# Copyright Samsung Electronics 2013. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""Loader for elastic search queries.
A set of utility methods to load queries for elastic recheck.
"""
import glob
import os.path
import yaml
def load(directory='queries'):
    """Load elastic search queries from the yaml files in a directory.

    Every ``*.yaml`` file directly inside ``directory`` is parsed, and the
    file's base name without the ``.yaml`` extension is stored (as a
    string) under the ``bug`` key of the parsed mapping.

    :param directory: path of the directory holding the query files.
    :returns: list of the parsed query dicts, ordered by file name; an
              empty list when no file matches.
    """
    data = []
    # glob yields names in arbitrary filesystem order; sort them so the
    # result is stable from one run to the next.
    for fname in sorted(glob.glob("%s/*.yaml" % directory)):
        # splitext removes only the extension. str.rstrip('.yaml') strips
        # a *set of characters*, so e.g. "family.yaml" would become "fami".
        bugnum = os.path.splitext(os.path.basename(fname))[0]
        # Close the handle deterministically instead of leaking it.
        with open(fname) as query_file:
            # safe_load: query files only need plain yaml mappings, and
            # plain yaml.load can construct arbitrary Python objects.
            query = yaml.safe_load(query_file)
        query['bug'] = bugnum
        data.append(query)
    return data

View File

@ -12,8 +12,8 @@
import fixtures
import json
import yaml
from elastic_recheck import loader
import elastic_recheck.tests
@ -36,7 +36,7 @@ class FakeES(object):
queries.yaml file, and grabbing the results we'd find for known bugs.
"""
def __init__(self, url):
self._yaml = yaml.load(open('elastic_recheck/tests/unit/queries.yaml').read())
self._yaml = loader.load('elastic_recheck/tests/unit/queries')
self._queries = {}
for item in self._yaml:
self._queries[item['query'].rstrip()] = item['bug']

View File

@ -1,83 +0,0 @@
- bug: 1226337
query: >
( @message:"NovaException: iSCSI device not found at"
OR message:"NovaException: iSCSI device not found at" )
AND filename:"logs/screen-n-cpu.txt"
- bug: 1211915
query: >
( @message:"ConnectionFailed: Connection to neutron failed: Maximum attempts reached"
OR message:"ConnectionFailed: Connection to neutron failed: Maximum attempts reached" )
AND filename:"console.html"
- bug: 1217734
query: >
( @message:"CalledProcessError: Command 'openssl' returned non-zero exit status"
OR message:"CalledProcessError: Command 'openssl' returned non-zero exit status" )
- bug: 1191960
query: >
(( @message:"Exit code: 5"
AND @message:" sudo cinder-rootwrap /etc/cinder/rootwrap.conf lvremove -f" )
OR ( message:"Exit code: 5"
AND message:" sudo cinder-rootwrap /etc/cinder/rootwrap.conf lvremove -f" ))
AND filename:"logs/screen-c-vol.txt"
- bug: 1225664
query: >
( @message:"Details: Time Limit Exceeded! (400s)while waiting for active, but we got killed."
OR message:"Details: Time Limit Exceeded! (400s)while waiting for active, but we got killed." )
AND filename:"console.html"
- bug: 1218391
query: >
( @message:"Cannot 'createImage'"
OR message:"Cannot 'createImage'" )
AND filename:"console.html"
- bug: 1229475
query: >
( @message:"Second simultaneous read on fileno"
OR message:"Second simultaneous read on fileno" )
- bug: 1230407
query: >
( @message:"Lock wait timeout exceeded; try restarting transaction"
OR message:"Lock wait timeout exceeded; try restarting transaction" )
AND filename:"logs/screen-q-svc.txt"
- bug: 1224001
query: >
( @message:"tempest.scenario.test_network_basic_ops AssertionError: Timed out waiting for"
OR message:"tempest.scenario.test_network_basic_ops AssertionError: Timed out waiting for" )
AND filename:"console.html"
- bug: 1235486
query: >
( @message:"update or delete on table \"networks\" violates foreign key constraint"
OR message:"update or delete on table \"networks\" violates foreign key constraint" )
AND filename:"logs/screen-q-svc.txt"
- bug: 1232748
query: >
( @message:"OperationalError: (OperationalError) could not translate host name \"localhost\" to address"
OR message:"OperationalError: (OperationalError) could not translate host name \"localhost\" to address" )
AND filename:"logs/screen-n-api.txt"
- bug: 1235435
query: >
(( @message:"One or more ports have an IP allocation from this subnet"
AND @message:" SubnetInUse: Unable to complete operation on subnet" )
OR ( message:"One or more ports have an IP allocation from this subnet"
AND message:" SubnetInUse: Unable to complete operation on subnet" ))
AND filename:"logs/screen-q-svc.txt"
- bug: 1235437
query: >
( @message:"failed to reach ACTIVE status within the required time (400 s). Current status: BUILD"
OR message:"failed to reach ACTIVE status within the required time (400 s). Current status: BUILD" )
AND filename:"console.html"
- bug: 1239637
query: >
( @message:"DBError: (IntegrityError) null value in column \"network_id\" violates not-null constraint"
OR message:"DBError: (IntegrityError) null value in column \"network_id\" violates not-null constraint" )
AND filename:"logs/screen-q-svc.txt"
- bug: 1239856
query: >
(( @message:"tempest/services" AND @message:"/images_client.py" AND @message:"wait_for_image_status" )
OR (message:"tempest/services" AND message:"/images_client.py" AND message:"wait_for_image_status" ))
AND filename:"console.html"
- bug: 1240256
query: >
( @message:" 503"
OR message:" 503" )
AND filename:"logs/syslog.txt"
AND syslog_program:"proxy-server"

View File

@ -0,0 +1,4 @@
query: >
message:"Exit code: 5"
AND message:" sudo cinder-rootwrap /etc/cinder/rootwrap.conf lvremove -f"
AND filename:"logs/screen-c-vol.txt"

View File

@ -0,0 +1,4 @@
query: >
message:"ConnectionFailed: Connection to neutron
failed: Maximum attempts reached"
AND filename:"console.html"

View File

@ -0,0 +1,3 @@
query: >
message:"CalledProcessError: Command ''openssl'' returned
non-zero exit status"

View File

@ -0,0 +1,3 @@
query: >
message:"Cannot ''createImage''"
AND filename:"console.html"

View File

@ -0,0 +1,4 @@
query: >
message:"tempest.scenario.test_network_basic_ops AssertionError:
Timed out waiting for"
AND filename:"console.html"

View File

@ -0,0 +1,4 @@
query: >
message:"No space left on device"
AND filename:"logs/syslog.txt"
AND syslog_program:"object-server"

View File

@ -0,0 +1,3 @@
query: >
message:"NovaException: iSCSI device not found at"
AND filename:"logs/screen-n-cpu.txt"

View File

@ -0,0 +1,2 @@
query: >
message:"Second simultaneous read on fileno"

View File

@ -0,0 +1,3 @@
query: >
message:"Lock wait timeout exceeded; try restarting transaction"
AND filename:"logs/screen-q-svc.txt"

View File

@ -0,0 +1,4 @@
query: >
message:"OperationalError: (OperationalError) could not translate
host name \"localhost\" to address"
AND filename:"logs/screen-n-api.txt"

View File

@ -0,0 +1,4 @@
query: >
message:"One or more ports have an IP allocation from this subnet"
AND message:" SubnetInUse: Unable to complete operation on subnet"
AND filename:"logs/screen-q-svc.txt"

View File

@ -0,0 +1,4 @@
query: >
message:"failed to reach ACTIVE status within the required time
(400 s). Current status: BUILD"
AND filename:"console.html"

View File

@ -0,0 +1,3 @@
query: >
message:"update or delete on table \"networks\" violates foreign key constraint"
AND filename:"logs/screen-q-svc.txt"

View File

@ -0,0 +1,4 @@
query: >
message:"DBError: (IntegrityError) null value in column \"network_id\"
violates not-null constraint"
AND filename:"logs/screen-q-svc.txt"

View File

@ -0,0 +1,3 @@
query: >
message:" Registry client request" AND message:"raised ClientConnectionError"
AND filename:"logs/screen-g-api.txt"

View File

@ -0,0 +1,4 @@
query: >
message:" 503"
AND filename:"logs/syslog.txt"
AND syslog_program:"proxy-server"

View File

@ -12,14 +12,13 @@
# License for the specific language governing permissions and limitations
# under the License.
import yaml
from elastic_recheck import elasticRecheck
from elastic_recheck import loader
from elastic_recheck import results
from elastic_recheck import tests
def fake_queries(filehandle):
def fake_queries(*args):
return [
{'query': '@message:"fake query" AND @fields.filename:"fake"\n',
'bug': 1226337},
@ -73,7 +72,7 @@ class TestClassifier(tests.TestCase):
def setUp(self):
super(TestClassifier, self).setUp()
self.stubs.Set(yaml, 'load', fake_queries)
self.stubs.Set(loader, 'load', fake_queries)
self.classifier = elasticRecheck.Classifier('queries.yaml')
def test_is_ready(self):

View File

@ -26,8 +26,8 @@ class TestElasticRecheck(unit.UnitTestCase):
def test_hits_by_query(self):
c = er.Classifier("queries.yaml")
q = ('''( @message:"Cannot 'createImage'" OR message:"'''
'''Cannot 'createImage'" ) AND filename:"console.html"''')
q = ('''message:"Cannot ''createImage''"'''
''' AND filename:"console.html"''')
results = c.hits_by_query(q)
self.assertEqual(len(results), 20)
self.assertEqual(results.took, 46)

View File

@ -1,95 +0,0 @@
- bug: 1191960
query: >
message:"Exit code: 5"
AND message:" sudo cinder-rootwrap /etc/cinder/rootwrap.conf lvremove -f"
AND filename:"logs/screen-c-vol.txt"
- bug: 1210483
query: >
message:"self.assertTrue(len(addresses) >= 1)"
AND filename:"console.html"
- bug: 1225664
query: >
message:"No space left on device"
AND filename:"logs/syslog.txt"
AND syslog_program:"object-server"
- bug: 1218391
query: >
message:"Cannot 'createImage'"
AND filename:"console.html"
- bug: 1224001
query: >
message:"tempest.scenario.test_network_basic_ops AssertionError: Timed out waiting for"
AND filename:"console.html"
- bug: 1235486
query: >
message:"update or delete on table \"networks\" violates foreign key constraint"
AND filename:"logs/screen-q-svc.txt"
- bug: 1235435
query: >
message:"One or more ports have an IP allocation from this subnet"
AND message:" SubnetInUse: Unable to complete operation on subnet"
AND filename:"logs/screen-q-svc.txt"
- bug: 1239637
query: >
message:"DBError: (IntegrityError) null value in column \"network_id\" violates not-null constraint"
AND filename:"logs/screen-q-svc.txt"
- bug: 1239856
query: >
message:" Registry client request"
AND message:"raised ClientConnectionError"
AND filename:"logs/screen-g-api.txt"
- bug: 1240256
query: >
message:" 503"
AND filename:"logs/syslog.txt"
AND syslog_program:"proxy-server"
- bug: 1225024
query: >
message:"MismatchError: {u'service':" AND message:"not in [{"
AND filename:"console.html"
- bug: 1244255
query: >
message:"NovaException: Unexpected vif_type=binding_failed"
AND filename:"logs/screen-n-cpu.txt"
- bug: 1249065
query: >
message:"No nw_info cache associated with instance"
AND filename:"logs/screen-n-api.txt"
- bug: 1250168
query: >
message:"Details: Timed out waiting for thing"
AND build_name:gate-tempest-devstack-vm-neutron-large-ops
- bug: 1251448
query: >
message:" possible networks found, use a Network ID to be more specific. (HTTP 400)"
AND filename:"console.html"
- bug: 1251784
query: >
message:"Connection to neutron failed: Maximum attempts reached"
AND filename:"logs/screen-n-cpu.txt"
- bug: 1251920
query: >
message:"assertionerror: console output was empty"
AND filename:"console.html"
- bug: 1251512
query: >
message:"test_get_console_output"
AND message:"raise MismatchError(matchee, matcher, mismatch, verbose)"
AND filename:"console.html"
- bug: 1252514
query: >
message:"Got error from Swift: put_object"
AND filename:"logs/screen-g-api.txt"
- bug: 1177134
query: >
message:"[ERROR] /opt/stack/old/devstack/exercises/bundle.sh:61 Image ami-00000001 not available within 15 seconds"
AND filename:"console.html"
- bug: 1253896
query: >
message:"SSHTimeout: Connection to the"
AND message:"via SSH timed out."
AND filename:"console.html"
- bug: 1254872
query: >
message:"libvirtError: Timed out during operation: cannot acquire state change lock"
AND filename:"logs/screen-n-cpu.txt"

4
queries/1177134.yaml Normal file
View File

@ -0,0 +1,4 @@
query: >
message:"[ERROR] /opt/stack/old/devstack/exercises/bundle.sh:61
Image ami-00000001 not available within 15 seconds"
AND filename:"console.html"

4
queries/1191960.yaml Normal file
View File

@ -0,0 +1,4 @@
query: >
message:"Exit code: 5"
AND message:" sudo cinder-rootwrap /etc/cinder/rootwrap.conf lvremove -f"
AND filename:"logs/screen-c-vol.txt"

3
queries/1210483.yaml Normal file
View File

@ -0,0 +1,3 @@
query: >
message:"self.assertTrue(len(addresses) >= 1)"
AND filename:"console.html"

3
queries/1218391.yaml Normal file
View File

@ -0,0 +1,3 @@
query: >
message:"Cannot ''createImage''"
AND filename:"console.html"

4
queries/1224001.yaml Normal file
View File

@ -0,0 +1,4 @@
query: >
message:"tempest.scenario.test_network_basic_ops AssertionError:
Timed out waiting for"
AND filename:"console.html"

3
queries/1225024.yaml Normal file
View File

@ -0,0 +1,3 @@
query: >
message:"MismatchError: {u''service'':" AND message:"not in [{"
AND filename:"console.html"

4
queries/1225664.yaml Normal file
View File

@ -0,0 +1,4 @@
query: >
message:"No space left on device"
AND filename:"logs/syslog.txt"
AND syslog_program:"object-server"

4
queries/1235435.yaml Normal file
View File

@ -0,0 +1,4 @@
query: >
message:"One or more ports have an IP allocation from this subnet"
AND message:" SubnetInUse: Unable to complete operation on subnet"
AND filename:"logs/screen-q-svc.txt"

3
queries/1235486.yaml Normal file
View File

@ -0,0 +1,3 @@
query: >
message:"update or delete on table \"networks\" violates foreign key constraint"
AND filename:"logs/screen-q-svc.txt"

4
queries/1239637.yaml Normal file
View File

@ -0,0 +1,4 @@
query: >
message:"DBError: (IntegrityError) null value in column \"network_id\"
violates not-null constraint"
AND filename:"logs/screen-q-svc.txt"

3
queries/1239856.yaml Normal file
View File

@ -0,0 +1,3 @@
query: >
message:" Registry client request" AND message:"raised ClientConnectionError"
AND filename:"logs/screen-g-api.txt"

4
queries/1240256.yaml Normal file
View File

@ -0,0 +1,4 @@
query: >
message:" 503"
AND filename:"logs/syslog.txt"
AND syslog_program:"proxy-server"

3
queries/1244255.yaml Normal file
View File

@ -0,0 +1,3 @@
query: >
message:"NovaException: Unexpected vif_type=binding_failed"
AND filename:"logs/screen-n-cpu.txt"

3
queries/1249065.yaml Normal file
View File

@ -0,0 +1,3 @@
query: >
message:"No nw_info cache associated with instance"
AND filename:"logs/screen-n-api.txt"

3
queries/1250168.yaml Normal file
View File

@ -0,0 +1,3 @@
query: >
message:"Details: Timed out waiting for thing"
AND build_name:gate-tempest-devstack-vm-neutron-large-ops

4
queries/1251448.yaml Normal file
View File

@ -0,0 +1,4 @@
query: >
message:" possible networks found, use a Network ID to be more
specific. (HTTP 400)"
AND filename:"console.html"

4
queries/1251512.yaml Normal file
View File

@ -0,0 +1,4 @@
query: >
message:"test_get_console_output"
AND message:"raise MismatchError(matchee, matcher, mismatch, verbose)"
AND filename:"console.html"

3
queries/1251784.yaml Normal file
View File

@ -0,0 +1,3 @@
query: >
message:"Connection to neutron failed: Maximum attempts reached"
AND filename:"logs/screen-n-cpu.txt"

3
queries/1251920.yaml Normal file
View File

@ -0,0 +1,3 @@
query: >
message:"assertionerror: console output was empty"
AND filename:"console.html"

3
queries/1252514.yaml Normal file
View File

@ -0,0 +1,3 @@
query: >
message:"Got error from Swift: put_object"
AND filename:"logs/screen-g-api.txt"

4
queries/1253896.yaml Normal file
View File

@ -0,0 +1,4 @@
query: >
message:"SSHTimeout: Connection to the" AND message:"via SSH timed
out."
AND filename:"console.html"

4
queries/1254872.yaml Normal file
View File

@ -0,0 +1,4 @@
query: >
message:"libvirtError: Timed out during operation: cannot acquire
state change lock"
AND filename:"logs/screen-n-cpu.txt"