rally/rally/task/engine.py

519 lines
20 KiB
Python

# Copyright 2013: Mirantis Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import copy
import json
import threading
import time
import traceback
from rally.common import cfg
from rally.common import logging
from rally.common import objects
from rally import consts
from rally import exceptions
from rally.task import context
from rally.task import hook
from rally.task import runner
from rally.task import scenario
from rally.task import sla
from rally.utils import strutils
LOG = logging.getLogger(__name__)
CONF = cfg.CONF

# Config options owned by the task engine. "raw_result_chunk_size" limits
# how many raw iteration results are stored per workload-data chunk
# (see ResultConsumer._consume_results).
TASK_ENGINE_OPTS = [
    cfg.IntOpt("raw_result_chunk_size", default=1000, min=1,
               help="Size of raw result chunk in iterations"),
]
class ResultConsumer(object):
    """ResultConsumer class stores results from ScenarioRunner, checks SLA.

    Also ResultConsumer listens for runner events and notifies HookExecutor
    about started iterations.
    """

    def __init__(self, workload_cfg, task, subtask, workload, runner,
                 abort_on_sla_failure, ctx_manager):
        """ResultConsumer constructor.

        :param workload_cfg: A configuration of the Workload
        :param task: Instance of Task, task to run
        :param subtask: Instance of Subtask
        :param workload: Instance of Workload
        :param runner: ScenarioRunner instance that produces results to be
                       consumed
        :param abort_on_sla_failure: True if the execution should be stopped
                                     when some SLA check fails
        :param ctx_manager: ContextManager instance
        """
        self.task = task
        self.subtask = subtask
        self.workload = workload
        self.workload_cfg = workload_cfg
        self.runner = runner
        # Boundaries of the load window; narrowed/extended as iteration
        # results arrive in _consume_results.
        self.load_started_at = float("inf")
        self.load_finished_at = 0
        # Number of raw-result chunks already saved to the DB.
        self.workload_data_count = 0
        self.sla_checker = sla.SLAChecker(self.workload_cfg)
        self.hook_executor = hook.HookExecutor(self.workload_cfg, self.task)
        self.abort_on_sla_failure = abort_on_sla_failure
        self.is_done = threading.Event()
        self.unexpected_failure = {}
        self.results = []
        self.thread = threading.Thread(target=self._consume_results)
        self.aborting_checker = threading.Thread(target=self.wait_and_abort)
        # The event consumer thread is only needed when hooks are configured.
        if self.workload_cfg["hooks"]:
            self.event_thread = threading.Thread(target=self._consume_events)
        self._cm = ctx_manager

    def __enter__(self):
        """Start the background consumer threads and the wall-clock timer."""
        self.thread.start()
        self.aborting_checker.start()
        if self.workload_cfg["hooks"]:
            self.event_thread.start()
        self.start = time.time()
        return self

    def _consume_results(self):
        """Drain the runner's result queue until the run is finished.

        Feeds every iteration result to the SLA checker, triggers a soft
        abort on SLA failure (when abort_on_sla_failure is set) and saves
        raw results to the DB in chunks of CONF.raw_result_chunk_size.
        """
        task_aborted = False
        while True:
            if self.runner.result_queue:
                results = self.runner.result_queue.popleft()
                self.results.extend(results)
                for r in results:
                    self.load_started_at = min(r["timestamp"],
                                               self.load_started_at)
                    self.load_finished_at = max(r["duration"] + r["timestamp"],
                                                self.load_finished_at)
                    success = self.sla_checker.add_iteration(r)
                    # Only the first SLA violation triggers the soft abort;
                    # later iterations are still consumed and recorded.
                    if (self.abort_on_sla_failure and
                            not success and
                            not task_aborted):
                        self.sla_checker.set_aborted_on_sla()
                        self.runner.abort()
                        self.task.update_status(
                            consts.TaskStatus.SOFT_ABORTING)
                        task_aborted = True

                # save results chunks
                chunk_size = CONF.raw_result_chunk_size
                while len(self.results) >= chunk_size:
                    results_chunk = self.results[:chunk_size]
                    self.results = self.results[chunk_size:]
                    results_chunk.sort(key=lambda x: x["timestamp"])
                    self.workload.add_workload_data(self.workload_data_count,
                                                    {"raw": results_chunk})
                    self.workload_data_count += 1
            # NOTE: is_set() replaces the isSet() alias which is deprecated
            # since Python 3.10.
            elif self.is_done.is_set():
                break
            else:
                time.sleep(0.1)

    def _consume_events(self):
        """Forward runner events to the hook executor until the run ends."""
        while not self.is_done.is_set() or self.runner.event_queue:
            if self.runner.event_queue:
                event = self.runner.event_queue.popleft()
                self.hook_executor.on_event(
                    event_type=event["type"], value=event["value"])
            else:
                time.sleep(0.01)

    def __exit__(self, exc_type, exc_value, exc_traceback):
        """Stop consumers, finalize SLA state and persist workload results."""
        self.finish = time.time()
        self.is_done.set()
        self.aborting_checker.join()
        self.thread.join()

        if exc_type:
            self.sla_checker.set_unexpected_failure(exc_value)

        if objects.Task.get_status(
                self.task["uuid"]) == consts.TaskStatus.ABORTED:
            self.sla_checker.set_aborted_manually()

        # load_finished_at stays 0 when no results arrived, hence the clamp.
        load_duration = max(self.load_finished_at - self.load_started_at, 0)

        # Lazy %-style args: the message is only formatted if INFO is enabled.
        LOG.info("Load duration is: %s",
                 strutils.format_float_to_str(load_duration))
        LOG.info("Full runner duration is: %s",
                 strutils.format_float_to_str(self.runner.run_duration))
        LOG.info("Full duration is: %s",
                 strutils.format_float_to_str(self.finish - self.start))

        results = {}
        if self.workload_cfg["hooks"]:
            self.event_thread.join()
            results["hooks_results"] = self.hook_executor.results()

        if self.results:
            # NOTE(boris-42): Sort in order of starting
            #                 instead of order of ending
            self.results.sort(key=lambda x: x["timestamp"])
            self.workload.add_workload_data(self.workload_data_count,
                                            {"raw": self.results})
        start_time = (self.load_started_at
                      if self.load_started_at != float("inf") else None)
        self.workload.set_results(load_duration=load_duration,
                                  full_duration=(self.finish - self.start),
                                  sla_results=self.sla_checker.results(),
                                  start_time=start_time,
                                  contexts_results=self._cm.contexts_results(),
                                  **results)

    @staticmethod
    def is_task_in_aborting_status(task_uuid, check_soft=True):
        """Checks task is in abort stages

        :param task_uuid: UUID of task to check status
        :type task_uuid: str
        :param check_soft: check or not SOFT_ABORTING status
        :type check_soft: bool
        """
        stages = [consts.TaskStatus.ABORTING, consts.TaskStatus.ABORTED]
        if check_soft:
            stages.append(consts.TaskStatus.SOFT_ABORTING)
        return objects.Task.get_status(task_uuid) in stages

    def wait_and_abort(self):
        """Waits until abort signal is received and aborts runner in this case.

        Has to be run from different thread simultaneously with the
        runner.run method.
        """
        while not self.is_done.is_set():
            if self.is_task_in_aborting_status(self.task["uuid"],
                                               check_soft=False):
                self.runner.abort()
                self.task.update_status(consts.TaskStatus.ABORTED)
                break
            time.sleep(2.0)
class TaskAborted(Exception):
    """Raised internally to interrupt a task run.

    TaskEngine raises and catches this exception to unwind from the
    subtask/workload loops when an abort has been requested.
    """
class TaskEngine(object):
    """The Task engine class is used to execute benchmark scenarios.

    An instance of this class is initialized by the API with the task
    configuration and then is used to validate and execute all specified
    in config subtasks.

    .. note::

        Typical usage:

        ...

        engine = TaskEngine(config, task, env)
        engine.validate()  # to test config
        engine.run()  # to run config
    """
def __init__(self, config, task, env, abort_on_sla_failure=False):
"""TaskEngine constructor.
:param config: An instance of a rally.task.config.TaskConfig
:param task: Instance of Task,
the current task which is being performed
:param env: Instance of Environment,
:param abort_on_sla_failure: True if the execution should be stopped
when some SLA check fails
"""
self.config = config
self.task = task
self.env = env
self.abort_on_sla_failure = abort_on_sla_failure
def _validate_workload(self, workload, vcontext=None, vtype=None):
"""Validate a workload.
:param workload: a workload configuration
:param vcontext: a validation context
:param vtype: a type of validation (platform, syntax or semantic)
"""
scenario_cls = scenario.Scenario.get(workload["name"])
scenario_context = copy.deepcopy(scenario_cls.get_default_context())
results = []
results.extend(scenario.Scenario.validate(
name=workload["name"],
context=vcontext,
config=workload,
plugin_cfg=None,
vtype=vtype))
if workload["runner_type"]:
results.extend(runner.ScenarioRunner.validate(
name=workload["runner_type"],
context=vcontext,
config=None,
plugin_cfg=workload["runner"],
vtype=vtype))
for context_name, context_conf in workload["contexts"].items():
results.extend(context.Context.validate(
name=context_name,
context=vcontext,
config=None,
plugin_cfg=context_conf,
vtype=vtype))
for context_name, context_conf in scenario_context.items():
results.extend(context.Context.validate(
name=context_name,
context=vcontext,
config=None,
plugin_cfg=context_conf,
allow_hidden=True,
vtype=vtype))
for sla_name, sla_conf in workload["sla"].items():
results.extend(sla.SLA.validate(
name=sla_name,
context=vcontext,
config=None,
plugin_cfg=sla_conf,
vtype=vtype))
for hook_conf in workload["hooks"]:
action_name, action_cfg = hook_conf["action"]
results.extend(hook.HookAction.validate(
name=action_name,
context=vcontext,
config=None,
plugin_cfg=action_cfg,
vtype=vtype))
trigger_name, trigger_cfg = hook_conf["trigger"]
results.extend(hook.HookTrigger.validate(
name=trigger_name,
context=vcontext,
config=None,
plugin_cfg=trigger_cfg,
vtype=vtype))
if results:
msg = "\n ".join(results)
kw = {"name": workload["name"],
"pos": workload["position"],
"config": json.dumps(
objects.Workload.to_task(workload)),
"reason": msg}
raise exceptions.InvalidTaskConfig(**kw)
@logging.log_task_wrapper(LOG.info, "Task validation of syntax.")
def _validate_config_syntax(self, config):
for subtask in config.subtasks:
for workload in subtask["workloads"]:
self._validate_workload(workload, vtype="syntax")
@logging.log_task_wrapper(LOG.info,
"Task validation of required platforms.")
def _validate_config_platforms(self, config):
# FIXME(andreykurilin): prepare the similar context object to others
platforms = dict(
(p["platform_name"], p["platform_data"])
for p in self.env.data["platforms"].values())
ctx = {"task": self.task,
"platforms": platforms}
for subtask in config.subtasks:
for workload in subtask["workloads"]:
self._validate_workload(
workload, vcontext=ctx, vtype="platform")
@logging.log_task_wrapper(LOG.info, "Task validation of semantic.")
def _validate_config_semantic(self, config):
LOG.info("Check health of the environment '%s'." % self.env.uuid)
failed = []
for p, res in self.env.check_health().items():
LOG.info("Platform %s (available: %s): %s" %
(p, res["available"], res["message"]))
if not res["available"]:
failed.append(p)
if logging.is_debug():
LOG.error(res["traceback"])
if failed:
raise exceptions.ValidationError(
"One or several platforms are not available: %s. Check logs "
"for more details." % ", ".join(failed))
validation_ctx = self.env.get_validation_context()
env_data = self.env.data
env_data["platforms"] = dict(
(p["platform_name"], p["platform_data"])
for p in env_data["platforms"].values())
ctx_obj = {"task": self.task,
"config": validation_ctx,
"env": env_data}
with context.ContextManager(ctx_obj):
for subtask in config.subtasks:
for workload in subtask["workloads"]:
self._validate_workload(
workload, vcontext=ctx_obj, vtype="semantic")
@logging.log_task_wrapper(LOG.info, "Task validation.")
def validate(self, only_syntax=False):
"""Perform full task configuration validation.
:param only_syntax: Check only syntax of task configuration
"""
self.task.update_status(consts.TaskStatus.VALIDATING)
try:
self._validate_config_syntax(self.config)
if only_syntax:
return
self._validate_config_platforms(self.config)
self._validate_config_semantic(self.config)
except Exception as e:
exception_info = json.dumps(traceback.format_exc(), indent=2,
separators=(",", ": "))
self.task.set_failed(type(e).__name__, str(e), exception_info)
expected_errors = (
# this error is a wrapper for all error messages from
# validators.
exceptions.InvalidTaskConfig,
# rally.task.task_cfg raises it
# _validate_config_semantic raises this error in case of
# failed platform check{s}
exceptions.ValidationError)
if logging.is_debug() and not isinstance(e, expected_errors):
LOG.exception("Unexpected error had happened while validating "
"task.")
raise
def _prepare_context(self, ctx, scenario_name, owner_id):
context_config = {}
# restore full names of plugins
scenario_plugin = scenario.Scenario.get(scenario_name)
for k, v in scenario_plugin.get_default_context().items():
c = context.Context.get(k, allow_hidden=True)
context_config[c.get_fullname()] = v
for k, v in ctx.items():
context_config[context.Context.get(k).get_fullname()] = v
env_data = self.env.data
env_data["platforms"] = dict(
(p["platform_name"], p["platform_data"])
for p in env_data["platforms"].values())
context_obj = {
"task": self.task,
"owner_id": owner_id,
"scenario_name": scenario_name,
"config": context_config,
"env": env_data
}
return context_obj
@logging.log_task_wrapper(LOG.info, "Running task.")
def run(self):
"""Run the benchmark according to the test configuration.
Test configuration is specified on engine initialization.
:returns: List of dicts, each dict containing the results of all the
corresponding benchmark test launches
"""
self.task.update_status(consts.TaskStatus.RUNNING)
try:
for subtask in self.config.subtasks:
self._run_subtask(subtask)
except TaskAborted:
LOG.info("Received aborting signal.")
self.task.update_status(consts.TaskStatus.ABORTED)
else:
if objects.Task.get_status(
self.task["uuid"]) != consts.TaskStatus.ABORTED:
self.task.update_status(consts.TaskStatus.FINISHED)
def _run_subtask(self, subtask):
subtask_obj = self.task.add_subtask(title=subtask["title"],
description=subtask["description"],
contexts=subtask["contexts"])
try:
# TODO(astudenov): add subtask context here
for workload in subtask["workloads"]:
self._run_workload(subtask_obj, workload)
except TaskAborted:
subtask_obj.update_status(consts.SubtaskStatus.ABORTED)
raise
except Exception:
subtask_obj.update_status(consts.SubtaskStatus.CRASHED)
# TODO(astudenov): save error to DB
LOG.exception("Unexpected exception during the subtask execution")
# NOTE(astudenov): crash task after exception in subtask
self.task.update_status(consts.TaskStatus.CRASHED)
raise
else:
subtask_obj.update_status(consts.SubtaskStatus.FINISHED)
def _run_workload(self, subtask_obj, workload):
if ResultConsumer.is_task_in_aborting_status(self.task["uuid"]):
raise TaskAborted()
workload_obj = subtask_obj.add_workload(
name=workload["name"],
description=workload["description"],
position=workload["position"],
runner=workload["runner"],
runner_type=workload["runner_type"],
hooks=workload["hooks"],
contexts=workload["contexts"],
sla=workload["sla"],
args=workload["args"])
workload["uuid"] = workload_obj["uuid"]
workload_cfg = objects.Workload.to_task(workload)
LOG.info("Running workload: \n"
" position = %(position)s\n"
" config = %(cfg)s"
% {"position": workload["position"],
"cfg": json.dumps(workload_cfg, indent=3)})
runner_cls = runner.ScenarioRunner.get(workload["runner_type"])
runner_obj = runner_cls(self.task, workload["runner"])
context_obj = self._prepare_context(
workload["contexts"], workload["name"], workload_obj["uuid"])
try:
ctx_manager = context.ContextManager(context_obj)
with ResultConsumer(workload, task=self.task, subtask=subtask_obj,
workload=workload_obj, runner=runner_obj,
abort_on_sla_failure=self.abort_on_sla_failure,
ctx_manager=ctx_manager):
with ctx_manager:
runner_obj.run(workload["name"], context_obj,
workload["args"])
except Exception:
LOG.exception("Unexpected exception during the workload execution")
# TODO(astudenov): save error to DB