Add YODA for undercloud and overcloud workloads

YODA is a Browbeat workload for Ironic and TripleO.

It can perform and monitor the following:

 * Introspection, bulk or batch
 * Cloud deployment with varying node types and numbers
 * Baremetal node import timing (actually done during introspection tests)

Metrics that are gathered include:

 * Time to PXE
 * Time until pingable
 * Success/failure rates and times
 * Overcloud metadata after each deploy

Potential issues

Change-Id: I89809cc35db2cfaa39f8ede49ec853572c0e468e
jkilpatr 2016-10-10 09:54:33 -04:00 committed by Justin Kilpatrick
parent 41681ebcbd
commit b21cd3cebc
20 changed files with 1099 additions and 138 deletions

View File

@ -1,10 +1,14 @@
[
{% for host in groups['controller'] %}
{{hostvars[host]| to_nice_json}},
{% endfor %}
{% for host in groups['compute'] %}
{{hostvars[host]| to_nice_json}},
{% endfor %}
{% if groups['controller'] is defined %}
{% for host in groups['controller'] %}
{{hostvars[host]| to_nice_json}},
{% endfor %}
{% endif %}
{% if groups['compute'] is defined %}
{% for host in groups['compute'] %}
{{hostvars[host]| to_nice_json}},
{% endfor %}
{% endif %}
{% for host in groups['undercloud'] %}
{{hostvars[host]| to_nice_json}}
{% endfor %}

View File

@ -19,6 +19,7 @@
- perfkitbenchmarker
- rally
- shaker
- yoda
- flavors
- images
environment: "{{proxy_env}}"

View File

@ -39,6 +39,9 @@ shaker_venv: "{{home_dir}}/shaker-venv"
# Shaker version to Install
shaker_version: 0.0.17
# The default YODA venv
yoda_venv: /home/stack/yoda-venv
# PerfKitBenchmarker Settings
perfkit_venv: "{{home_dir}}/perfkit-venv"
perfkit_version: v1.12.0

View File

@ -0,0 +1,18 @@
---
#
# YODA Install
#
- name: Create yoda virtualenv
command: virtualenv {{ yoda_venv }} creates={{ yoda_venv }}
- name: Install yoda requirements
pip: name={{item}} virtualenv={{yoda_venv}}
with_items:
- openstacksdk
- python-heatclient
- python-tripleoclient
- elasticsearch
- pykwalify
- python-dateutil
- git+https://github.com/jkilpatr/ostag/#egg=ostag

View File

@ -31,6 +31,7 @@
- browbeat/perfkitbenchmarker
- browbeat/rally
- browbeat/shaker
- browbeat/yoda
- browbeat/flavors
- browbeat/images
- browbeat/browbeat-network
@ -59,6 +60,5 @@
- name: Run Browbeat
hosts: undercloud
roles:
- browbeat/bug-check
- browbeat/grafana-dashboard-setup
- browbeat/browbeat-run

View File

@ -20,6 +20,7 @@
- browbeat/perfkitbenchmarker
- browbeat/rally
- browbeat/shaker
- browbeat/yoda
- browbeat/flavors
- browbeat/images
- browbeat/browbeat-network

View File

@ -4,4 +4,4 @@
shell:
"source {{ ansible_env.HOME }}/browbeat-venv/bin/activate; \
cd {{ ansible_env.HOME }}/browbeat/; \
python browbeat.py rally > {{ ansible_env.HOME }}/browbeat/results/browbeat_run.log"
python browbeat.py all > {{ ansible_env.HOME }}/browbeat/results/browbeat_run.log"

View File

@ -40,23 +40,6 @@ grafana:
snapshot:
enabled: false
snapshot_compute: false
perfkit:
enabled: true
sleep_before: 0
sleep_after: 0
venv: /home/stack/perfkit-venv/bin/activate
default:
image: centos7
machine_type: m1.small
os_type: rhel
openstack_image_username: centos
openstack_floating_ip_pool: browbeat_public
openstack_network: nova_test_net_name.stdout
benchmarks:
- name: fio-centos-m1-small
enabled: false
benchmarks: fio
data_disk_size: 4
rally:
enabled: true
sleep_before: 5
@ -138,104 +121,3 @@ rally:
sla_max_avg_duration: 12
sla_max_seconds: 30
sla_max_failure: 0
#shaker scenarios require at least 2 compute nodes
shaker:
enabled: true
server: localhost
port: 5555
flavor: m1.small
join_timeout: 600
sleep_before: 5
sleep_after: 5
venv: /home/stack/shaker-venv
dns_nameserver: 192.168.23.1
shaker_region: regionOne
scenarios:
- name: l2-4-1
enabled: true
density: 4
compute: 1
progression: linear
time: 60
file: lib/python2.7/site-packages/shaker/scenarios/openstack/dense_l2.yaml
- name: l2-8-1
enabled: true
density: 8
compute: 1
progression: linear
time: 60
file: lib/python2.7/site-packages/shaker/scenarios/openstack/dense_l2.yaml
- name: l2-4-2
enabled: true
density: 4
compute: 2
progression: linear
time: 60
file: lib/python2.7/site-packages/shaker/scenarios/openstack/dense_l2.yaml
- name: l2-4-8
enabled: true
density: 8
compute: 2
progression: linear
time: 60
file: lib/python2.7/site-packages/shaker/scenarios/openstack/dense_l2.yaml
- name: l3-north-south-4-1
enabled: true
placement: single_room
density: 4
compute: 1
progression: null
file: lib/python2.7/site-packages/shaker/scenarios/openstack/dense_l3_north_south.yaml
- name: l3-north-south-8-1
enabled: false
placement: single_room
density: 8
compute: 1
progression: null
time: 60
file: lib/python2.7/site-packages/shaker/scenarios/openstack/dense_l3_north_south.yaml
- name: l3-north-south-4-2
enabled: true
placement: single_room
density: 4
compute: 2
progression: null
time: 60
file: lib/python2.7/site-packages/shaker/scenarios/openstack/dense_l3_north_south.yaml
- name: l3-north-south-8-2
enabled: true
placement: single_room
density: 8
compute: 2
progression: null
time: 60
file: lib/python2.7/site-packages/shaker/scenarios/openstack/dense_l3_north_south.yaml
- name: l3-east-west-4-1
enabled: true
density: 4
compute: 1
placement: single_room
time: 60
file: lib/python2.7/site-packages/shaker/scenarios/openstack/dense_l3_east_west.yaml
- name: l3-east-west-8-1
enabled: true
density: 8
compute: 1
placement: single_room
time: 60
file: lib/python2.7/site-packages/shaker/scenarios/openstack/dense_l3_east_west.yaml
- name: l3-east-west-4-2
enabled: true
density: 4
compute: 2
placement: single_room
time: 60
file: lib/python2.7/site-packages/shaker/scenarios/openstack/dense_l3_east_west.yaml
- name: l3-east-west-8-2
enabled: true
density: 8
compute: 2
time: 60
placement: single_room
file: lib/python2.7/site-packages/shaker/scenarios/openstack/dense_l3_east_west.yaml

View File

@ -0,0 +1,182 @@
# Tests to be completed for the install-and-check.sh script; minimal and short
# workloads are performed to confirm functionality.
browbeat:
results : results/
rerun: 1
cloud_name: {{ browbeat_cloud_name }}
elasticsearch:
enabled: {{ elastic_enabled_template }}
host: {{ elastic_host_template }}
port: 9200
regather: true
metadata_files:
- name: hardware-metadata
file: metadata/hardware-metadata.json
- name: environment-metadata
file: metadata/environment-metadata.json
- name: software-metadata
file: metadata/software-metadata.json
- name: version
file: metadata/version.json
ansible:
ssh_config: ansible/ssh-config
hosts: ansible/hosts
adjust:
keystone_token: ansible/browbeat/adjustment-keystone-token.yml
neutron_l3: ansible/browbeat/adjustment-l3.yml
nova_db: ansible/browbeat/adjustment-db.yml
workers: ansible/browbeat/adjustment-workers.yml
grafana_snapshot: ansible/browbeat/snapshot-general-performance-dashboard.yml
metadata: ansible/gather/site.yml
connmon:
enabled: {{ connmon_enabled_template }}
sudo: true
grafana:
enabled: {{ grafana_enabled_template }}
grafana_ip: {{ grafana_host_template }}
grafana_port: 3000
dashboards:
- openstack-general-system-performance
snapshot:
enabled: false
snapshot_compute: false
yoda:
enabled: true
instackenv: "/home/stack/instackenv.json"
stackrc: "/home/stack/stackrc"
venv: "/home/stack/yoda-venv/bin/activate"
benchmarks:
- name: introspect-{{ overcloud_size }}-10-individual-batch-2
type: introspection
enabled: true
method: individual
times: 10
timeout: 900
batch_size: 2
- name: introspect-{{ overcloud_size }}-10-individual-batch-4
type: introspection
enabled: true
method: individual
times: 10
timeout: 900
batch_size: 4
- name: introspect-{{ overcloud_size }}-10-individual-batch-8
type: introspection
enabled: true
method: individual
times: 10
timeout: 900
batch_size: 8
- name: introspect-{{ overcloud_size }}-10-individual-batch-16
type: introspection
enabled: true
method: individual
times: 10
timeout: 900
batch_size: 16
- name: introspect-{{ overcloud_size }}-10-individual-batch-32
type: introspection
enabled: true
method: individual
times: 10
timeout: 900
batch_size: 32
- name: introspect-{{ overcloud_size }}-10-individual-batch-{{ overcloud_size }}
type: introspection
enabled: true
method: individual
times: 10
timeout: 900
batch_size: {{ overcloud_size }}
- name: introspect-{{ overcloud_size }}-50-bulk
type: introspection
enabled: true
method: bulk
times: 50
timeout: 900
- name: No-HA-Max-Compute-{{ overcloud_size }}-full-deploy
type: overcloud
ntp_server: clock01.util.phx2.redhat.com
timeout: 600
templates:
- ""
enabled: true
step: 5
keep_stack: false
times: 2
cloud:
- node: "compute"
start_scale: 1
end_scale: {{ overcloud_size | int - 1 }}
- node: "control"
start_scale: 1
end_scale: 1
- name: No-HA-Max-Compute-{{ overcloud_size }}-stack-update
type: overcloud
ntp_server: clock01.util.phx2.redhat.com
timeout: 600
templates:
- ""
instackenv: "/home/stack/instackenv.json"
enabled: true
step: 5
keep_stack: true
times: 2
cloud:
- node: "compute"
start_scale: 1
end_scale: {{ overcloud_size | int - 1 }}
- node: "control"
start_scale: 1
end_scale: 1
- name: HA-Max-Compute-{{ overcloud_size }}-full-deploy
type: overcloud
ntp_server: clock01.util.phx2.redhat.com
timeout: 600
templates:
- ""
enabled: true
step: 5
keep_stack: false
times: 2
cloud:
- node: "compute"
start_scale: 1
end_scale: {{ overcloud_size | int - 3 }}
- node: "control"
start_scale: 3
end_scale: 3
- name: HA-Max-Compute-{{ overcloud_size }}-stack-update
type: overcloud
ntp_server: clock01.util.phx2.redhat.com
timeout: 600
templates:
- ""
enabled: true
step: 5
keep_stack: true
times: 2
cloud:
- node: "compute"
start_scale: 1
end_scale: {{ overcloud_size | int - 3 }}
- node: "control"
start_scale: 3
end_scale: 3

View File

@ -5,3 +5,5 @@ grafana_enabled_template: false
grafana_host_template: "1.2.3.4.5"
browbeat_config_file: "browbeat-basic.yaml.j2"
browbeat_cloud_name: "browbeat_ci"
overcloud_size: "{{ groups['overcloud'] | length }}"
ntp_server: "pool.ntp.org"

View File

@ -369,3 +369,34 @@ shaker:
time: 60
placement: single_room
file: lib/python2.7/site-packages/shaker/scenarios/openstack/dense_l3_east_west.yaml
#yoda scenarios WILL redeploy your overcloud
yoda:
enabled: false
instackenv: "/home/stack/instackenv.json"
stackrc: "/home/stack/stackrc"
venv: "/home/stack/yoda-venv/bin/activate"
benchmarks:
- name: scale-deploy
type: overcloud
ntp_server: pool.ntp.org
enabled: true
templates:
- ""
timeout: 600 #deploy timeout in minutes
step: 1
keep_stack: false
times: 3
cloud:
- node: "compute"
start_scale: 1
end_scale: 1
- node: "control"
start_scale: 1
end_scale: 3
- name: introspect-batch
type: introspection
enabled: true
method: individual #other option is bulk
times: 3
timeout: 900 #introspection timeout in seconds
batch_size: 2

View File

@ -15,6 +15,7 @@ from lib.Elastic import browbeat_uuid
import lib.PerfKit
import lib.Rally
import lib.Shaker
import lib.Yoda
import lib.WorkloadBase
import lib.Tools
import argparse
@ -24,7 +25,7 @@ import time
import datetime
import os
_workload_opts = ['perfkit', 'rally', 'shaker']
_workload_opts = ['perfkit', 'rally', 'shaker', 'yoda']
_config_file = 'browbeat-config.yaml'
debug_log_file = 'log/debug.log'

View File

@ -143,6 +143,76 @@ using some simple searches such as:
shaker_uuid: 97092334-34e8-446c-87d6-6a0f361b9aa8 AND record.concurrency: 1 AND result.result_type: bandwidth
shaker_uuid: c918a263-3b0b-409b-8cf8-22dfaeeaf33e AND record.concurrency:1 AND record.test:Bi-Directional
Running YODA
============
YODA (Yet Openstack Deployment tool, Another) is a workload integrated into
Browbeat for benchmarking TripleO deployment. This includes importing baremetal
nodes, running introspection, and performing overcloud deployments of various
kinds. Note that YODA assumes it is running on the undercloud of a TripleO
instance, post undercloud installation and introspection.
Configuration
-------------
For examples of the configuration see `browbeat-complete.yaml` in the repo root directory.
Additional configuration documentation can be found below for each subworkload of YODA.
Overcloud
~~~~~~~~~
For overcloud workloads, note that the nodes dictionary is dynamic: you don't
have to define node types you aren't using (the demonstration configurations
define them all only for the sake of completeness). Furthermore, the node name
is taken from the name of the field, so custom role names should work fine
there. The step parameter decides how many nodes can be distributed among the
various types to get from the start scale to the end scale; if these are the
same it won't matter, but if they differ, up to that many nodes are distributed
to the different node types (in no particular order) before the next deploy is
performed. The step rule is violated only when that is required to keep the
deployment viable; for example, if the step dictates that 2 control nodes be
deployed, YODA skips ahead to 3 even though that violates the step.
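As an illustration (the node counts here are hypothetical), a cloud section
like the following, with `step: 2` set on the benchmark, would deploy at 1,
then 3, then 5 compute nodes while holding 3 controllers throughout:

    cloud:
      - node: "compute"
        start_scale: 1
        end_scale: 5
      - node: "control"
        start_scale: 3
        end_scale: 3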
YODA has basic support for custom templates and more advanced roles; configure
the `templates:` parameter in the overcloud benchmark section with a string of
template paths.
templates: "-e /usr/share/openstack-tripleo-heat-templates/environments/network-isolation.yaml"
Note that `--templates` is passed to the `overcloud deploy` command before
this, then node scales, NTP server, and timeout are passed after, so your
templates will override the defaults, but not the scale, timeout, or NTP
settings from the YODA config. If you want to use scheduling hints for your
overcloud deploy you will need to pip install
[ostag](https://github.com/jkilpatr/ostag) and set `node_pinning: True` in
your config file. Ostag will then be used before every deploy to clean all
tags and tag the appropriate nodes. If you set `node_pinning: False`, tags
will only be cleaned before the deploy. If you need more advanced features,
see the ostag readme for how to tag based on node properties. If you don't
want YODA to edit your node properties, don't define `node_pinning` in your
configuration.
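As a minimal sketch, enabling pinning is a single flag on an overcloud
benchmark entry (all other keys as in the examples above):

    node_pinning: True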
Introspection
~~~~~~~~~~~~~
Introspection workloads have two modes, batch and individual. The batch
workload follows the documentation exactly: nodes are imported, then bulk
introspection is run. Individual introspection has its own custom batch size
and handles failures more gracefully (individual retries instead of group
retries). Both have a timeout configured in seconds and record the amount of
time required for each node to PXE and the number of failures.
`timeout` is how long we wait for a node to come back from introspection; this
varies with hardware. That said, the default of 900 seconds has been shown to
be the 99th percentile for success across at least two sets of hardware.
Adjust as required.
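For reference, these are the introspection knobs as they appear in the
`introspect-batch` example from `browbeat-complete.yaml` in this commit:

    - name: introspect-batch
      type: introspection
      enabled: true
      method: individual #other option is bulk
      times: 3
      timeout: 900 #introspection timeout in seconds
      batch_size: 2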
Note that `batch_size` cannot produce a batch of unintrospected nodes if none
exist, so the last batch may be below the maximum size. When nodes in a batch
fail, the `failure_count` is incremented and the nodes are returned to the
pool, so it's possible the same node will fail again in another batch. There
is a safety mechanism that kills YODA if a node exceeds 10 retries, as that is
pretty much guaranteed to be a misconfiguration. For bulk introspection, all
nodes are tried once and what you get is what you get.
If you wish to change the introspection workload failure threshold from the
default of 10%, set `max_fail_amnt` to any floating point value you desire.
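For example, to tolerate up to a quarter of nodes failing introspection (a
hypothetical threshold):

    max_fail_amnt: 0.25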
I would suggest bulk introspection for testing documented TripleO workflows and
individual introspection to test the performance of introspection itself.
Interpreting Browbeat Results
=============================

View File

@ -61,7 +61,7 @@ class Connmon(object):
def connmon_graphs(self, result_dir, test_name):
cmd = "python graphing/connmonplot.py {}/connmon/{}.csv".format(result_dir,
test_name)
return self.tools.run_cmd(cmd)
return self.tools.run_cmd(cmd)['stdout']
# Move connmon results
def move_connmon_results(self, result_dir, test_name):

View File

@ -66,7 +66,7 @@ class Rally(WorkloadBase.WorkloadBase):
cmd += "rally {} task start {} --task-args \'{}\' 2>&1 | tee {}.log".format(
plugin_string, task_file, task_args, test_name)
from_time = time.time()
self.tools.run_cmd(cmd)
self.tools.run_cmd(cmd)['stdout']
to_time = time.time()
if 'sleep_after' in self.config['rally']:
time.sleep(self.config['rally']['sleep_after'])
@ -93,7 +93,7 @@ class Rally(WorkloadBase.WorkloadBase):
def get_task_id(self, test_name):
cmd = "grep \"rally task results\" {}.log | awk '{{print $4}}'".format(
test_name)
return self.tools.run_cmd(cmd)
return self.tools.run_cmd(cmd)['stdout']
def _get_details(self):
self.logger.info(
@ -111,17 +111,17 @@ class Rally(WorkloadBase.WorkloadBase):
cmd = "source {}; ".format(self.config['rally']['venv'])
cmd += "rally task report --task {} --out {}.html".format(
all_task_ids, test_name)
return self.tools.run_cmd(cmd)
return self.tools.run_cmd(cmd)['stdout']
def gen_scenario_json(self, task_id):
cmd = "source {}; ".format(self.config['rally']['venv'])
cmd += "rally task results {}".format(task_id)
return self.tools.run_cmd(cmd)
return self.tools.run_cmd(cmd)['stdout']
def gen_scenario_json_file(self, task_id, test_name):
cmd = "source {}; ".format(self.config['rally']['venv'])
cmd += "rally task results {} > {}.json".format(task_id, test_name)
return self.tools.run_cmd(cmd)
return self.tools.run_cmd(cmd)['stdout']
def rally_metadata(self, result, meta):
result['rally_metadata'] = meta

View File

@ -39,7 +39,7 @@ class Shaker(WorkloadBase.WorkloadBase):
def shaker_checks(self):
cmd = "source /home/stack/overcloudrc; glance image-list | grep -w shaker-image"
if self.tools.run_cmd(cmd) == "":
if self.tools.run_cmd(cmd)['stdout'] == "":
self.logger.error("Shaker Image is not built, try again")
exit(1)
else:

View File

@ -13,6 +13,7 @@
import PerfKit
import Rally
import Shaker
import Yoda
import logging
import os
import subprocess
@ -29,16 +30,37 @@ class Tools(object):
self.config = config
return None
# Returns true if ping successful, false otherwise
def is_pingable(self, ip):
cmd = "ping -c1 " + ip
result = self.run_cmd(cmd)
if result['rc'] == 0:
return True
else:
return False
# Run command async from the python main thread, return Popen handle
def run_async_cmd(self, cmd):
FNULL = open(os.devnull, 'w')
self.logger.debug("Running command : %s" % cmd)
process = subprocess.Popen(cmd, shell=True, stdout=FNULL)
return process
# Run command, return a dict with stdout, stderr and the return code
def run_cmd(self, cmd):
self.logger.debug("Running command : %s" % cmd)
process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
stdout, stderr = process.communicate()
if len(stderr) > 0:
return None
else:
return stdout.strip()
output_dict = {}
output_dict['stdout'] = stdout.strip()
output_dict['stderr'] = stderr.strip()
output_dict['rc'] = process.returncode
if process.returncode > 0:
self.logger.error("Command {} returned with error".format(cmd))
self.logger.error("stdout: {}".format(stdout))
self.logger.error("stderr: {}".format(stderr))
return output_dict
# Find Command on host
def find_cmd(self, cmd):
@ -103,6 +125,9 @@ class Tools(object):
elif provider == "shaker":
shaker = Shaker.Shaker(self.config)
shaker.run_shaker()
elif provider == "yoda":
yoda = Yoda.Yoda(self.config)
yoda.start_workloads()
else:
self.logger.error("Unknown workload provider: {}".format(provider))
@ -118,6 +143,7 @@ class Tools(object):
def gather_metadata(self):
os.putenv("ANSIBLE_SSH_ARGS",
" -F {}".format(self.config['ansible']['ssh_config']))
ansible_cmd = \
'ansible-playbook -i {} {}' \
.format(self.config['ansible']['hosts'], self.config['ansible']['metadata'])
@ -175,3 +201,15 @@ class Tools(object):
if workload is "perfkit":
# Stub for PerfKit.
continue
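# Parse KEY=value pairs out of a stackrc file; values built via shell
# substitution ($(...)) are resolved by echoing them through run_cmd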
def load_stackrc(self, filepath):
values = {}
with open(filepath) as stackrc:
for line in stackrc:
pair = line.split('=')
if 'export' not in line and '#' not in line and '$(' not in line:
values[pair[0].strip()] = pair[1].strip()
elif '$(' in line and 'for key' not in line:
values[pair[0].strip()] = \
self.run_cmd("echo " + pair[1].strip())['stdout'].strip()
return values

lib/Yoda.py (new file, 643 lines)
View File

@ -0,0 +1,643 @@
#!/usr/bin/env python
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Yet another cloud deployment tool
import datetime
import Elastic
import Grafana
import json
import logging
import time
import Tools
import WorkloadBase
from openstack import connection
from openstack import exceptions
import os
import requests
try:
from ostag import ostag
except ImportError:
ostag = None
from collections import deque
class Yoda(WorkloadBase.WorkloadBase):
def __init__(self, config):
self.logger = logging.getLogger('browbeat.yoda')
self.config = config
self.tools = Tools.Tools(self.config)
self.grafana = Grafana.Grafana(self.config)
self.elastic = Elastic.Elastic(self.config, self.__class__.__name__.lower())
self.error_count = 0
self.pass_count = 0
self.test_count = 0
self.scenario_count = 0
def get_stats(self):
self.logger.info(
"Current number of YODA tests executed: {}".format(
self.test_count))
self.logger.info(
"Current number of YODA tests passed: {}".format(
self.pass_count))
self.logger.info(
"Current number of YODA tests failed: {}".format(
self.error_count))
def update_tests(self):
self.test_count += 1
self.update_total_tests()
def update_pass_tests(self):
self.pass_count += 1
self.update_total_pass_tests()
def update_fail_tests(self):
self.error_count += 1
self.update_total_fail_tests()
def update_scenarios(self):
self.scenario_count += 1
self.update_total_scenarios()
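# Append a provision state to state_list only when it changed relative
# to the most recent entry, giving an ordered history of transitions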
def state_tracker_extend(self, state, state_list):
if state is None:
return state_list
elif state_list is None:
return [state]
elif state in state_list[-1]:
return state_list
else:
state_list.append(state)
return state_list
def node_is_cleaning(self, provision_state):
ret = provision_state is not None
ret = ret and 'clean' in provision_state
ret = ret and 'fail' not in provision_state
return ret
def is_cleaning(self, conn):
for node in conn.bare_metal.nodes():
if self.node_is_cleaning(node.provision_state):
return True
return False
def failed_cleaning_count(self, conn):
count = 0
for node in conn.bare_metal.nodes():
if self.node_is_cleaning(node.provision_state):
count += 1
return count
def wait_for_clean(self, env_setup, conn):
wait_time = 1
# 15 minute timeout
timeout = (60 * 15)
while self.is_cleaning(conn):
# Cleans can fail, so we just try again
if wait_time % 1000 == 0:
self.set_ironic_node_state("manage", env_setup, conn)
time.sleep(30)
self.set_ironic_node_state("provide", env_setup, conn)
time.sleep(1)
wait_time += 1
if wait_time > timeout:
self.logger.error("Node Cleaning failed")
exit(1)
# Required to use console commands because of this bug
# https://bugs.launchpad.net/python-openstacksdk/+bug/1668767
def set_ironic_node_state(self, state, env_setup, conn, node_uuid=""):
if node_uuid != "":
nodes = [node_uuid]
else:
nodes = deque(map(lambda node: node.id, conn.bare_metal.nodes()))
if state == "manage":
cmd_base = "{} openstack baremetal node manage {}"
for _ in range(len(nodes)):
node = nodes.pop()
node_obj = conn.bare_metal.get_node(node)
if "manage" not in node_obj.provision_state:
nodes.append(node)
elif state == "provide":
cmd_base = "{} openstack baremetal node provide {}"
for _ in range(len(nodes)):
node = nodes.pop()
node_obj = conn.bare_metal.get_node(node)
prov_state = node_obj.provision_state
if prov_state is not None and "available" not in prov_state:
nodes.append(node)
elif state == "inspect":
cmd_base = "{} openstack baremetal introspection start {}"
elif state == "off":
cmd_base = "{} openstack baremetal node power off {}"
for _ in range(len(nodes)):
node = nodes.pop()
node_obj = conn.bare_metal.get_node(node)
if "off" not in node_obj.power_state:
nodes.append(node)
elif state == "on":
cmd_base = "{} openstack baremetal node power on {}"
for _ in range(len(nodes)):
node = nodes.pop()
node_obj = conn.bare_metal.get_node(node)
if "on" not in node_obj.power_state:
nodes.append(node)
elif state == "delete":
cmd_base = "{} openstack baremetal node delete {}"
else:
self.logger.error("set_ironic_node_state() called with invalid state")
exit(1)
for node in nodes:
cmd = cmd_base.format(env_setup, node)
self.tools.run_async_cmd(cmd)
time.sleep(.5)
# Gathers metrics on the instack env import
def import_instackenv(self, filepath, env_setup, conn):
results = {}
filepath = os.path.abspath(os.path.expandvars(filepath))
cmd = "{} openstack overcloud node import {}".format(env_setup, filepath)
start_time = datetime.datetime.utcnow()
out = self.tools.run_cmd(cmd)
nodes = conn.bare_metal.nodes()
for node in nodes:
while 'enroll' in node.provision_state:
node = conn.bare_metal.get_node(node)
time.sleep(1)
end_time = datetime.datetime.utcnow()
results['import_time'] = (end_time - start_time).total_seconds()
if out['stderr'] == '' or 'Error' not in out['stderr']:
results['import_status'] = "success"
else:
results['import_status'] = "failure"
self.logger.error("Instackenv import returned 1, printing stderr")
self.logger.error(out['stderr'])
return results
# Introspection with exactly the documented workflow
def introspection_bulk(self, timeout, env_setup, conn):
results = {}
nodes = deque(map(lambda node: node.id, conn.bare_metal.nodes()))
cmd = "{} openstack overcloud node introspect --all-manageable".format(env_setup)
results['nodes'] = {}
for node in conn.bare_metal.nodes(details=True):
results['nodes'][node.id] = {}
results['nodes'][node.id]["last_error"] = node.last_error
results['nodes'][node.id]["driver"] = node.driver
results['nodes'][node.id]["driver_info"] = node.driver_info
results['nodes'][node.id]["properties"] = node.properties
results['nodes'][node.id]["failures"] = 0
results['nodes'][node.id]["state_list"] = None
self.tools.run_async_cmd(cmd)
out = self.watch_introspecting_nodes(nodes, timeout, conn, results)
failed = out[0]
results['raw'] = out[1]
results["failure_count"] = len(failed)
return results
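# Poll each node until its properties change and it powers back off
# (introspection finished) or the timeout expires; unfinished nodes are
# returned as failures along with per-node completion times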
def watch_introspecting_nodes(self, nodes, timeout, conn, results):
start_time = datetime.datetime.utcnow()
times = []
timeout = datetime.timedelta(seconds=timeout)
while len(nodes):
node = nodes.pop()
# rate limit
time.sleep(1)
node_obj = conn.bare_metal.get_node(node)
if node_obj is None:
self.logger.error("Can't find node " + node +
" Which existed at the start of introspection \
did you delete it manually?")
continue
# == works here for string comparison because they are in fact
# the same object if not changed
stored_properties = str(results['nodes'][node_obj.id]["properties"])
node_properties = str(node_obj.properties)
changed = not stored_properties == node_properties
powered_off = 'off' in node_obj.power_state
not_cleaning = 'clean' not in node_obj.provision_state
if changed and powered_off and not_cleaning:
results['nodes'][node_obj.id]["properties"] = node_obj.properties
results['nodes'][node_obj.id]["state_list"] = \
self.state_tracker_extend(node_obj.provision_state,
results['nodes'][node_obj.id]["state_list"])
times.append((datetime.datetime.utcnow() - start_time).total_seconds())
elif (datetime.datetime.utcnow() - start_time) > timeout:
for node in nodes:
node_obj = conn.bare_metal.get_node(node)
results['nodes'][node_obj.id]['failures'] += 1
if results['nodes'][node_obj.id]['failures'] > 10:
self.logger.error("Node "
+ node_obj.id
+ "has failed more than 10 introspections")
self.logger.error("This probably means it's misconfigured, exiting")
exit(1)
break
else:
results['nodes'][node_obj.id]["state_list"] = \
self.state_tracker_extend(node_obj.provision_state,
results['nodes'][node_obj.id]["state_list"])
nodes.appendleft(node)
return (nodes, times)
# Introspection with robust failure handling
def introspection_individual(self, batch_size, timeout, env_setup, conn):
nodes = deque(map(lambda node: node.id, conn.bare_metal.nodes()))
failure_count = 0
batch = deque()
results = {}
results['raw'] = []
results['nodes'] = {}
for node in conn.bare_metal.nodes(details=True):
results['nodes'][node.id] = {}
results['nodes'][node.id]["last_error"] = node.last_error
results['nodes'][node.id]["driver"] = node.driver
results['nodes'][node.id]["driver_info"] = node.driver_info
results['nodes'][node.id]["properties"] = node.properties
results['nodes'][node.id]["failures"] = 0
results['nodes'][node.id]["state_list"] = None
while len(nodes):
node = nodes.pop()
self.set_ironic_node_state("inspect", env_setup, conn, node)
batch.append(node)
if len(batch) >= batch_size or (len(nodes) == 0 and len(batch) != 0):
out = self.watch_introspecting_nodes(batch, timeout, conn, results)
failed = out[0]
results['raw'].extend(out[1])
failure_count = failure_count + len(failed)
nodes.extend(failed)
batch.clear()
results["failure_count"] = failure_count
return results
def delete_stack(self, conn):
wait_time = 0
# 30 minute timeout
timeout = (60 * 30)
while conn.orchestration.find_stack("overcloud") is not None:
# Deletes can fail, so we just try again
if wait_time % 2000 == 0:
conn.orchestration.delete_stack("overcloud")
time.sleep(5)
wait_time += 5
if wait_time > timeout:
self.logger.error("Overcloud stack delete failed")
exit(1)
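# Track current and previous scale per node type; previous_ starts at
# -1 so the first iteration always registers as a change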
def setup_nodes_dict(self, benchmark):
nodes = {}
for service in benchmark['cloud']:
nodes[service['node']] = service['start_scale']
nodes["previous_" + service['node']] = -1
return nodes
def update_nodes_dict(self, benchmark, nodes, changed):
# update settings for next round, note if changes are made
step = benchmark['step']
nodes_added = 0
for service in benchmark['cloud']:
node_type = service['node']
end_scale = service['end_scale']
nodes["previous_" + node_type] = nodes[node_type]
if nodes[node_type] < end_scale:
difference = end_scale - nodes[node_type]
allowed_difference = step - nodes_added
add = min(difference, allowed_difference)
nodes[node_type] += add
nodes_added += add
changed = True
# edge cases, note we must round up otherwise we get
# stuck forever if step is 1, this also means we must
# violate the step rules to both ensure a valid deployment
# and progression
if 'control' in nodes and nodes['control'] == 2:
nodes['control'] = 3
if 'ceph' in nodes and nodes['ceph'] > 0 and nodes['ceph'] < 3:
nodes['ceph'] = 3
return (nodes, changed)
def deploy_overcloud(self, start_time, results, ntp_server, conn, env_setup, benchmark):
if type(ntp_server) != str:
self.logger.error("Please configure an NTP server!")
exit(1)
cmd = env_setup + "openstack overcloud deploy --templates "
for template in benchmark['templates']:
cmd = cmd + " " + template + " "
for service in benchmark['cloud']:
cmd = cmd + " --" + service['node'] + "-scale " + str(results[service['node']])
cmd = cmd + " --timeout=" + str(benchmark['timeout']) + " --ntp-server=" + str(ntp_server)
self.logger.debug("Openstack deployment command is " + cmd)
results["overcloud_deploy_command"] = cmd
deploy_process = self.tools.run_async_cmd(cmd)
results['cleaning_failures'] = self.failed_cleaning_count(conn)
results['nodes'] = {}
while deploy_process.poll() is None:
time.sleep(5)
try:
for node in conn.compute.servers():
time.sleep(1)
# look for new instances to add to our metadata
if node.name not in results['nodes']:
results['nodes'][node.name] = {}
create_time = datetime.datetime.strptime(node.created_at,
"%Y-%m-%dT%H:%M:%SZ")
results['nodes'][node.name]['created_at'] = \
(create_time - start_time).total_seconds()
results['nodes'][node.name]['scheduler_hints'] = \
node.scheduler_hints
results['nodes'][node.name]['state_list'] = None
# try and figure out which baremetal node this
# instance is scheduled on
if 'bm_node' not in results['nodes'][node.name]:
try:
bm_node = next(conn.bare_metal.nodes(details=True,
instance_id=node.id))
results['nodes'][node.name]['bm_node'] = \
bm_node.id
results['nodes'][node.name]['bm_node_properties'] = \
bm_node.properties
results['nodes'][node.name]['bm_node_driver'] = \
bm_node.driver
results['nodes'][node.name]['bm_last_error'] = \
bm_node.last_error
except StopIteration:
continue
update_time = datetime.datetime.strptime(node.updated_at,
"%Y-%m-%dT%H:%M:%SZ")
results['nodes'][node.name]['last_updated_at'] = \
(update_time - start_time).total_seconds()
results['nodes'][node.name]['final_status'] = node.status
bm_node = next(conn.bare_metal.nodes(details=True,
instance_id=node.id))
state_list = results['nodes'][node.name]['state_list']
state_list = \
self.state_tracker_extend(bm_node.provision_state,
state_list)
rentry = results['nodes'][node.name]
# Populate this field so it gets indexed every time
# even if nodes are never pingable
rentry['ping_time'] = -1
# `and` short-circuits, so is_pingable is only called once the
# instance actually has a private address; otherwise the address
# lookup would raise a KeyError
condition = 'private' in node.addresses
condition = condition and 'pingable_at' not in rentry
condition = condition and \
self.tools.is_pingable(node.addresses['private'])
if condition:
ping_time = datetime.datetime.utcnow()
rentry['ping_time'] = (ping_time - start_time).total_seconds()
except exceptions.HttpException:
self.logger.error("OpenStack bare_metal API is returning NULL")
self.logger.error("This sometimes happens during stack creates")
return results
def elastic_insert(self, results, run, start_time, benchmark, results_dir):
scenario_name = benchmark['name']
results['action'] = scenario_name.strip()
results['browbeat_rerun'] = run
results['timestamp'] = str(start_time).replace(" ","T")
results['grafana_url'] = self.grafana.grafana_urls()
results['scenario'] = benchmark['name']
results['scenario_config'] = benchmark
# Create list of objects for Elastic insertion rather than
# dict of dicts. Insert key to not lose name data
nodes_data = []
for key in results['nodes']:
results['nodes'][key]['name'] = key
nodes_data.append(results['nodes'][key])
results['nodes'] = nodes_data
results = self.elastic.combine_metadata(results)
if not self.elastic.index_result(results, scenario_name, results_dir):
self.update_index_failures()
def dump_scenario_json(self, results_dir, json, time):
with open(results_dir + "/" + str(time).strip() + ".json", 'w') as outfile:
outfile.write(json)
def setup_scenario(self, benchmark_name, dir_ts):
results_dir = self.tools.create_results_dir(self.config['browbeat']['results'],
dir_ts,
benchmark_name,
benchmark_name)
if type(results_dir) is bool:
self.logger.error("Malformed Config, benchmark names must be unique!")
exit(1)
self.logger.debug("Created result directory: {}".format(results_dir))
workload = self.__class__.__name__
self.workload_logger(results_dir, workload)
return results_dir
def introspection_workload(self, benchmark, run, results_dir, env_setup, conn):
self.delete_stack(conn)
self.wait_for_clean(env_setup, conn)
test_start = datetime.datetime.utcnow()
self.wait_for_clean(env_setup, conn)
self.set_ironic_node_state("delete", env_setup, conn)
while len(list(conn.bare_metal.nodes())) > 0:
time.sleep(5)
import_results = self.import_instackenv(benchmark['instackenv'], env_setup, conn)
self.set_ironic_node_state("manage", env_setup, conn)
self.set_ironic_node_state("off", env_setup, conn)
if benchmark['method'] == "individual":
introspection_results = self.introspection_individual(benchmark['batch_size'],
benchmark['timeout'],
env_setup, conn)
elif benchmark['method'] == "bulk":
introspection_results = self.introspection_bulk(benchmark['timeout'], env_setup, conn)
else:
self.logger.error("Malformed YODA configuration for " + benchmark['name'])
exit(1)
self.get_stats()
# Combines dicts but mutates introspection_results rather than
# returning a new value
import_results.update(introspection_results)
results = import_results
results['total_nodes'] = len(list(map(lambda node: node.id, conn.bare_metal.nodes())))
# If maximum failure percentage is not set, we set it to 10%
if 'max_fail_amnt' not in benchmark:
benchmark['max_fail_amnt'] = .10
if results['failure_count'] >= results['total_nodes'] * benchmark['max_fail_amnt']:
self.update_fail_tests()
else:
self.update_pass_tests()
self.update_tests()
self.dump_scenario_json(results_dir, json.dumps(results), test_start)
if self.config['elasticsearch']['enabled']:
self.elastic_insert(results, run, test_start, benchmark, results_dir)
def overcloud_workload(self, benchmark, run, results_dir, env_setup, conn):
if conn.orchestration.find_stack("overcloud") is None:
self.set_ironic_node_state("provide", env_setup, conn)
self.wait_for_clean(env_setup, conn)
keep_stack = benchmark['keep_stack']
results = self.setup_nodes_dict(benchmark)
changed = True
while changed:
changed = False
# Can't scale from HA to non HA or back
control_change = results['control'] != results['previous_control']
if keep_stack and not control_change:
results['method'] = "update"
else:
self.delete_stack(conn)
self.wait_for_clean(env_setup, conn)
results['method'] = "new"
start_time = datetime.datetime.utcnow()
if 'node_pinning' in benchmark:
if ostag is None:
self.logger.error("ostag is not installed please run")
self.logger.error(" pip install git+https://github.com/jkilpatr/ostag")
self.logger.error("Pinning not used in this test!")
elif benchmark['node_pinning']:
ostag.clear_tags(conn)
for node in benchmark['cloud']:
ostag.mark_nodes("", node['node'], conn, False, "", node['end_scale'])
else:
ostag.clear_tags(conn)
results = self.deploy_overcloud(start_time, results,
benchmark['ntp_server'],
conn, env_setup,
benchmark)
results['total_time'] = (datetime.datetime.utcnow() - start_time).total_seconds()
results['result'] = str(conn.orchestration.find_stack("overcloud").status)
results['result_reason'] = str(conn.orchestration.find_stack("overcloud").status_reason)
results['total_nodes'] = len(list(map(lambda node: node.id, conn.bare_metal.nodes())))
if "COMPLETE" in results['result']:
self.update_pass_tests()
else:
self.update_fail_tests()
self.update_tests()
self.get_stats()
self.tools.gather_metadata()
self.dump_scenario_json(results_dir, json.dumps(results), start_time)
if self.config['elasticsearch']['enabled']:
self.elastic_insert(results, run, start_time, benchmark, results_dir)
out = self.update_nodes_dict(benchmark, results, changed)
results = out[0]
changed = out[1]
def start_workloads(self):
"""Iterates through all yoda scenarios in browbeat yaml config file"""
self.logger.info("Starting YODA workloads")
es_ts = datetime.datetime.utcnow()
dir_ts = es_ts.strftime("%Y%m%d-%H%M%S")
self.logger.debug("Time Stamp (Prefix): {}".format(dir_ts))
stackrc = self.config.get('yoda')['stackrc']
venv = self.config.get('yoda')['venv']
env_setup = "source {}; source {};".format(stackrc,venv)
auth_vars = self.tools.load_stackrc(stackrc)
if 'OS_AUTH_URL' not in auth_vars:
self.logger.error("Please make sure your stackrc is configured correctly")
exit(1)
auth_args = {
'auth_url': auth_vars['OS_AUTH_URL'],
'project_name': 'admin',
'username': auth_vars['OS_USERNAME'],
'password': auth_vars['OS_PASSWORD'],
'verify': False
}
requests.packages.urllib3.disable_warnings()
conn = connection.Connection(**auth_args)
instackenv = self.config.get('yoda')['instackenv']
benchmarks = self.config.get('yoda')['benchmarks']
if (benchmarks is not None and len(benchmarks) > 0):
for benchmark in benchmarks:
if benchmark['enabled']:
results_dir = self.setup_scenario(benchmark['name'], dir_ts)
times = benchmark['times']
if 'instackenv' not in benchmark:
benchmark['instackenv'] = instackenv
for rerun in range(self.config['browbeat']['rerun']):
for run in range(times):
self.update_tests()
if benchmark['type'] == "overcloud":
self.overcloud_workload(benchmark,
run,
results_dir,
env_setup,
conn)
elif benchmark['type'] == "introspection":
self.introspection_workload(benchmark,
run,
results_dir,
env_setup,
conn)
else:
self.logger.error("Could not identify YODA workload!")
exit(1)
self.update_scenarios()
else:
self.logger.info(
"Skipping {} benchmarks enabled: false".format(benchmark['name']))
else:
self.logger.error("Config file contains no yoda benchmarks.")

View File

@ -281,3 +281,86 @@ mapping:
file:
type: str
required: True
yoda:
required: False
type: map
allowempty: True
mapping:
enabled:
type: bool
required: True
instackenv:
type: str
required: True
stackrc:
type: str
required: True
venv:
type: str
required: True
benchmarks:
type: seq
required: True
sequence:
- type: map
mapping:
name:
type: str
required: True
type:
type: str
required: True
enabled:
required: True
type: bool
ntp_server:
type: str
required: False
templates:
type: seq
required: False
sequence:
- type: str
instackenv:
type: str
required: False
times:
type: int
required: True
step:
type: int
required: False
method:
type: str
required: False
timeout:
type: int
required: True
max_fail_amnt:
type: float
required: False
batch_size:
type: int
required: False
keep_stack:
type: bool
required: False
node_pinning:
type: bool
required: False
cloud:
type: seq
sequence:
- type: map
allowempty: True
mapping:
node:
type: str
required: True
start_scale:
type: int
required: True
end_scale:
type: int
required: True

View File

@ -3,3 +3,5 @@ elasticsearch
python-dateutil==2.4.2
python-openstackclient==3.11.0
pykwalify
elasticsearch
openstacksdk