# kloudbuster/kloudbuster/kb_runner.py

# Copyright 2015 Cisco Systems, Inc.  All rights reserved.
#
#    Licensed under the Apache License, Version 2.0 (the "License"); you may
#    not use this file except in compliance with the License. You may obtain
#    a copy of the License at
#
#         http://www.apache.org/licenses/LICENSE-2.0
#
#    Unless required by applicable law or agreed to in writing, software
#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
#    License for the specific language governing permissions and limitations
#    under the License.

from __future__ import division
from collections import deque
from distutils.version import LooseVersion
import threading
import time

import log as logging
import redis

# Expected agent versions for which a version-mismatch warning has already
# been logged. KBRunner.run() adds to this set so that each unexpected
# version is only warned about once per process.
vm_version_mismatches = set()

# Module-level logger, obtained from the `log` module imported above as `logging`.
LOG = logging.getLogger(__name__)

class KBVMUpException(Exception):
    """Raised by KBRunner.wait_for_vm_up() when not every VM reported READY."""

class KBSetStaticRouteException(Exception):
    """Raised by KBRunner.setup_static_route() when not every VM succeeded."""

class KBHTTPServerUpException(Exception):
    """Raised by KBRunner.check_http_service() when not every VM succeeded."""

class KBHTTPBenchException(Exception):
    """Error in the HTTP benchmarking step; handled by KBRunner.single_run()."""

class KBProxyConnectionException(Exception):
    """Raised by KBRunner.setup_redis() when the Redis server is unreachable."""

class KBRunner(object):
    """
    Control the testing VMs on the testing cloud
    """

    def __init__(self, client_list, config, expected_agent_version, single_cloud=True):
        self.full_client_dict = dict(zip([x.vm_name for x in client_list], client_list))
        self.client_dict = self.full_client_dict
        self.config = config
        self.single_cloud = single_cloud
        self.result = {}
        self.host_stats = {}
        self.tool_result = {}
        self.expected_agent_version = expected_agent_version
        self.agent_version = None
        self.report = {'seq': 0, 'report': None}

        # Redis
        self.redis_obj = None
        self.pubsub = None
        self.orches_chan_name = "kloudbuster_orches"
        self.report_chan_name = "kloudbuster_report"
        self.message_queue = deque()

    def header_formatter(self, stage, vm_count):
        conns = vm_count * self.config.http_tool_configs.connections
        rate_limit = vm_count * self.config.http_tool_configs.rate_limit
        msg = "Stage %d: %d VM(s), %d Connections, %d Expected RPS" %\
              (stage, vm_count, conns, rate_limit)
        return msg

    def msg_handler(self):
        for message in self.pubsub.listen():
            if message['data'] == "STOP":
                break
            LOG.kbdebug(message)
            self.message_queue.append(message)

    def setup_redis(self, redis_server, redis_server_port=6379, timeout=120):
        """
        Connect to the Redis server and start the report listener thread.

        The connection is probed up to max(timeout // polling_interval, 1)
        times, sleeping config.polling_interval between failed attempts.

        :param redis_server: host of the Redis server
        :param redis_server_port: port of the Redis server
        :param timeout: overall budget used to derive the number of attempts
        :raises KBProxyConnectionException: if every attempt failed
        """
        LOG.info("Setting up the redis connections...")
        pool = redis.ConnectionPool(host=redis_server, port=redis_server_port, db=0)

        # NOTE(review): redis-py appears to ignore the socket_* arguments when
        # an explicit connection_pool is supplied -- confirm before relying on
        # these one-second timeouts.
        self.redis_obj = redis.StrictRedis(connection_pool=pool,
                                           socket_connect_timeout=1,
                                           socket_timeout=1)

        # Check for connections to redis server
        max_attempts = max(timeout // self.config.polling_interval, 1)
        for attempt in xrange(max_attempts):
            try:
                self.redis_obj.get("test")
            except redis.exceptions.ConnectionError:
                LOG.info("Connecting to redis server... Retry #%d/%d", attempt, max_attempts)
                time.sleep(self.config.polling_interval)
            else:
                # Probe went through, the server is reachable
                break
        else:
            # Every attempt failed
            LOG.error("Error: Cannot connect to the Redis server")
            raise KBProxyConnectionException()

        # Subscribe to message channel
        self.pubsub = self.redis_obj.pubsub(ignore_subscribe_messages=True)
        self.pubsub.subscribe(self.report_chan_name)

        # Drain the subscription in a daemon thread (see msg_handler)
        listener = threading.Thread(target=self.msg_handler)
        listener.daemon = True
        self.msg_thread = listener
        listener.start()

    def dispose(self):
        self.redis_obj.publish(self.report_chan_name, "STOP")
        self.msg_thread.join()
        if self.pubsub:
            self.pubsub.unsubscribe()
            self.pubsub.close()

    def send_cmd(self, cmd, client_type, data):
        """
        Publish a command on the orchestration channel.

        :param cmd: command name placed under the 'cmd' key
        :param client_type: value placed under the 'client-type' key
        :param data: command payload placed under the 'data' key
        """
        envelope = {
            'cmd': cmd,
            'sender-id': 'kb-master',
            'client-type': client_type,
            'data': data,
        }
        LOG.kbdebug(envelope)
        self.redis_obj.publish(self.orches_chan_name, envelope)

    def polling_vms(self, timeout, polling_interval=None):
        '''
        Polling all VMs for the status of execution
        Guarantee to run once if the timeout is less than polling_interval
        '''
        if not polling_interval:
            polling_interval = self.config.polling_interval
        retry_count = max(timeout // polling_interval, 1)
        retry = cnt_succ = cnt_failed = 0
        clist = self.client_dict.copy()
        samples = []
        http_tool = self.client_dict.values()[0].http_tool

        while (retry < retry_count and len(clist)):
            time.sleep(polling_interval)
            sample_count = 0
            while True:
                try:
                    msg = self.message_queue.popleft()
                except IndexError:
                    # No new message, commands are in executing
                    break

                payload = eval(msg['data'])
                vm_name = payload['sender-id']
                cmd = payload['cmd']
                if cmd == 'READY':
                    # If a READY packet is received, the corresponding VM is up
                    # running. We mark the flag for that VM, and skip all READY
                    # messages received afterwards.
                    instance = self.full_client_dict[vm_name]
                    if instance.up_flag:
                        continue
                    else:
                        clist[vm_name].up_flag = True
                        clist.pop(vm_name)
                        cnt_succ = cnt_succ + 1
                        self.agent_version = payload['data']
                elif cmd == 'REPORT':
                    sample_count = sample_count + 1
                    # Parse the results from HTTP Tools
                    instance = self.client_dict[vm_name]
                    self.result[vm_name] = instance.http_client_parser(**payload['data'])
                    samples.append(self.result[vm_name])
                elif cmd == 'DONE':
                    self.result[vm_name] = payload['data']
                    clist.pop(vm_name)
                    if self.result[vm_name]['status']:
                        # Command returned with non-zero status, command failed
                        LOG.error("[%s] %s", vm_name, self.result[vm_name]['stderr'])
                        cnt_failed = cnt_failed + 1
                    else:
                        # Command returned with zero, command succeed
                        cnt_succ = cnt_succ + 1
                elif cmd == 'DEBUG':
                    LOG.info('[%s] %s' + (vm_name, payload['data']))
                else:
                    LOG.error('[%s] received invalid command: %s' + (vm_name, cmd))

            log_msg = "%d Succeed, %d Failed, %d Pending... Retry #%d" %\
                      (cnt_succ, cnt_failed, len(clist), retry)
            if sample_count != 0:
                log_msg += " (%d sample(s) received)" % sample_count
            LOG.info(log_msg)

            if sample_count != 0:
                report = http_tool.consolidate_samples(samples, len(self.client_dict))
                self.report['seq'] = self.report['seq'] + 1
                self.report['report'] = report
                LOG.info('Periodical report: %s.' % str(self.report))
                samples = []
            retry = retry + 1

        return (cnt_succ, cnt_failed, len(clist))

    def wait_for_vm_up(self, timeout=300):
        """
        Wait until every active VM reported in, then acknowledge with 'ACK'.

        :param timeout: budget handed to polling_vms()
        :raises KBVMUpException: if the success count differs from the
                                 number of active VMs
        """
        succeeded, _failed, _pending = self.polling_vms(timeout)
        if succeeded == len(self.client_dict):
            self.send_cmd('ACK', None, None)
            return
        raise KBVMUpException()

    def setup_static_route(self, active_range, timeout=30):
        """
        Send the 'setup_static_route' command to the 'http' clients and wait.

        :param active_range: forwarded to the VMs as 'active_range'
        :param timeout: budget handed to polling_vms()
        :raises KBSetStaticRouteException: if the success count differs from
                                           the number of active VMs
        """
        request = {'cmd': 'setup_static_route', 'active_range': active_range}
        self.send_cmd('EXEC', 'http', request)
        succeeded, _failed, _pending = self.polling_vms(timeout)
        if succeeded == len(self.client_dict):
            return
        raise KBSetStaticRouteException()

    def check_http_service(self, active_range, timeout=30):
        """
        Send the 'check_http_service' command to the 'http' clients and wait.

        :param active_range: forwarded to the VMs as 'active_range'
        :param timeout: budget handed to polling_vms()
        :raises KBHTTPServerUpException: if the success count differs from
                                         the number of active VMs
        """
        request = {'cmd': 'check_http_service', 'active_range': active_range}
        self.send_cmd('EXEC', 'http', request)
        succeeded, _failed, _pending = self.polling_vms(timeout)
        if succeeded == len(self.client_dict):
            return
        raise KBHTTPServerUpException()

    def run_http_test(self, active_range):
        """
        Start the HTTP benchmark on the 'http' clients and collect the results.

        Waits for config.http_tool_configs.duration plus a 30 second grace
        period, then replaces each entry of self.result with the output of
        the owning VM's http_client_parser.

        :param active_range: forwarded to the VMs as 'active_range'
        """
        tool_cfg = self.config.http_tool_configs
        request = {'cmd': 'run_http_test', 'active_range': active_range,
                   'parameter': dict(tool_cfg)}
        self.send_cmd('EXEC', 'http', request)

        # Give additional 30 seconds for everybody to report results
        grace_timeout = self.config.http_tool_configs.duration + 30
        _succeeded, _failed, still_pending = self.polling_vms(grace_timeout)
        if still_pending != 0:
            LOG.warn("Testing VMs are not returning results within grace period, "
                     "summary shown below may not be accurate!")

        # Parse the results from HTTP Tools
        # NOTE(review): for a VM that never sent DONE, self.result may hold an
        # older entry or none at all -- confirm the parser copes with that.
        for vm_name in list(self.client_dict):
            parse = self.client_dict[vm_name].http_client_parser
            self.result[vm_name] = parse(**self.result[vm_name])

    def gen_host_stats(self):
        self.host_stats = {}
        for vm in self.result.keys():
            phy_host = self.client_dict[vm].host
            if phy_host not in self.host_stats:
                self.host_stats[phy_host] = []
            self.host_stats[phy_host].append(self.result[vm])

        http_tool = self.client_dict.values()[0].http_tool
        for phy_host in self.host_stats:
            self.host_stats[phy_host] = http_tool.consolidate_results(self.host_stats[phy_host])

    def single_run(self, active_range=None):
        """
        Execute one complete benchmark pass and fill in self.tool_result.

        Steps: set up static routes (single cloud only), wait for the HTTP
        service, optionally wait for the operator to press enter, run the
        HTTP test and consolidate the per-VM results.

        :param active_range: forwarded to the per-step commands
        :return: True on success; False after one of the step exceptions was
                 caught, in which case dispose() has already been called
        """
        try:
            if self.single_cloud:
                LOG.info("Setting up static route to reach tested cloud...")
                self.setup_static_route(active_range)

            LOG.info("Waiting for HTTP service to come up...")
            self.check_http_service(active_range)

            if self.config.prompt_before_run:
                print("Press enter to start running benchmarking tools...")
                raw_input()

            LOG.info("Running HTTP Benchmarking...")
            self.run_http_test(active_range)

            # Call the method in corresponding tools to consolidate results
            http_tool = self.client_dict.values()[0].http_tool
            LOG.kbdebug(self.result.values())
            self.tool_result = http_tool.consolidate_results(self.result.values())

            active_vms = len(self.client_dict)
            all_vms = len(self.full_client_dict)
            self.tool_result['http_rate_limit'] = \
                active_vms * self.config.http_tool_configs.rate_limit
            self.tool_result['total_connections'] = \
                active_vms * self.config.http_tool_configs.connections
            self.tool_result['total_client_vms'] = all_vms
            self.tool_result['total_server_vms'] = all_vms
            # self.tool_result['host_stats'] = self.gen_host_stats()
        except (KBSetStaticRouteException, KBHTTPServerUpException,
                KBHTTPBenchException) as failure:
            # Same clean-up for every step failure; only the message differs
            if isinstance(failure, KBSetStaticRouteException):
                reason = "Could not set static route."
            elif isinstance(failure, KBHTTPServerUpException):
                reason = "HTTP service is not up in testing cloud."
            else:
                reason = "Error while running HTTP benchmarking tool."
            LOG.error(reason)
            self.dispose()
            return False

        return True

    def run(self):
        """
        Generator driving the whole benchmark; yields one tool_result per run.

        Waits for the agents to come up and warns once per unexpected agent
        version. With config.progression.enabled, the set of active VMs grows
        by vm_step per stage (starting at vm_start) and a result is yielded
        after each stage, until all VMs were used, a stage fails, or the
        previous stage's result reaches config.progression.stop_limit.
        Otherwise a single pass over all VMs is executed.

        dispose() is called before the generator finishes.
        """
        try:
            LOG.info("Waiting for agents on VMs to come up...")
            self.wait_for_vm_up()
            if not self.agent_version:
                self.agent_version = "0"
            if (LooseVersion(self.agent_version) != LooseVersion(self.expected_agent_version)):
                # only warn once for each unexpected VM version
                if self.expected_agent_version not in vm_version_mismatches:
                    vm_version_mismatches.add(self.expected_agent_version)
                    LOG.warn("The VM image you are running (%s) is not the expected version (%s) "
                             "this may cause some incompatibilities" %
                             (self.agent_version, self.expected_agent_version))
        except KBVMUpException:
            LOG.error("Some VMs failed to start.")
            self.dispose()
            return

        if self.config.progression.enabled:
            start = self.config.progression.vm_start
            step = self.config.progression.vm_step
            # limit[0]: maximum error ratio, limit[1]: latency percentile to check
            limit = self.config.progression.stop_limit
            timeout = self.config.http_tool_configs.timeout
            # VMs are activated in name order
            vm_list = sorted(self.full_client_dict)

            self.client_dict = {}
            cur_stage = 1

            while True:
                cur_vm_count = len(self.client_dict)
                target_vm_count = start + (cur_stage - 1) * step
                if target_vm_count > len(self.full_client_dict):
                    break
                if self.tool_result:
                    # Evaluate the stop condition on the previous result
                    total_req = self.tool_result['http_total_req']
                    if not total_req:
                        # Without any request the error ratio is undefined;
                        # stop cleanly instead of dividing by zero.
                        LOG.warn('KloudBuster is stopping the iteration because the '
                                 'previous stage reported zero HTTP requests.')
                        break
                    err = self.tool_result['http_sock_err'] / total_req
                    pert_dict = dict(self.tool_result['latency_stats'])
                    if limit[1] in pert_dict:
                        timeout_at_percentile = pert_dict[limit[1]] // 1000000
                    else:
                        timeout_at_percentile = 0
                        LOG.warn('Percentile %s%% is not a standard statistic point.' % limit[1])
                    if err > limit[0] or timeout_at_percentile > timeout:
                        LOG.warn('KloudBuster is stopping the iteration because the result '
                                 'reaches the stop limit.')
                        break

                # Activate the additional VMs of this stage
                for vm_name in vm_list[cur_vm_count:target_vm_count]:
                    self.client_dict[vm_name] = self.full_client_dict[vm_name]
                description = "-- %s --" % self.header_formatter(cur_stage, len(self.client_dict))
                LOG.info(description)
                if not self.single_run(active_range=[0, target_vm_count - 1]):
                    break
                LOG.info('-- Stage %s: %s --' % (cur_stage, str(self.tool_result)))
                self.tool_result['description'] = description
                cur_stage += 1
                yield self.tool_result

            self.dispose()
        else:
            if self.single_run():
                yield self.tool_result
            self.dispose()