Refactor Quorum to dedup common code in wait_join and run

Change-Id: Iaffc4b7ef500d3e17d5ef0daed07854d29436537
2015-04-07 16:09:24 +03:00 · 2015-04-07 16:09:24 +03:00 · 1a5108fb3c
commit 1a5108fb3c
parent ff11cd9e04
7 changed files with 190 additions and 88 deletions
--- a/doc/source/tools/shaker.txt
+++ b/doc/source/tools/shaker.txt
@ -1,11 +1,12 @@
-usage: shaker [-h] [--agent-loss-timeout AGENT_LOSS_TIMEOUT]
-              [--config-dir DIR] [--config-file PATH] [--debug]
-              [--external-net EXTERNAL_NET] [--flavor-name FLAVOR_NAME]
-              [--image-name IMAGE_NAME] [--log-config-append PATH]
-              [--log-date-format DATE_FORMAT] [--log-dir LOG_DIR]
-              [--log-file PATH] [--log-format FORMAT] [--nodebug]
-              [--nouse-syslog] [--nouse-syslog-rfc-format] [--noverbose]
-              [--os-auth-url <auth-url>] [--os-password <auth-password>]
+usage: shaker [-h] [--agent-join-timeout AGENT_JOIN_TIMEOUT]
+              [--agent-loss-timeout AGENT_LOSS_TIMEOUT] [--config-dir DIR]
+              [--config-file PATH] [--debug] [--external-net EXTERNAL_NET]
+              [--flavor-name FLAVOR_NAME] [--image-name IMAGE_NAME]
+              [--log-config-append PATH] [--log-date-format DATE_FORMAT]
+              [--log-dir LOG_DIR] [--log-file PATH] [--log-format FORMAT]
+              [--nodebug] [--nouse-syslog] [--nouse-syslog-rfc-format]
+              [--noverbose] [--os-auth-url <auth-url>]
+              [--os-password <auth-password>]
              [--os-region-name <auth-region-name>]
              [--os-tenant-name <auth-tenant-name>]
              [--os-username <auth-username>] [--output OUTPUT]
@ -17,8 +18,12 @@ usage: shaker [-h] [--agent-loss-timeout AGENT_LOSS_TIMEOUT]

 optional arguments:
  -h, --help            show this help message and exit
+  --agent-join-timeout AGENT_JOIN_TIMEOUT
+                        How long to wait for agents to join in seconds (time
+                        between stack deployment and start of scenario
+                        execution).
  --agent-loss-timeout AGENT_LOSS_TIMEOUT
-                        Timeout to treat agent as lost
+                        Timeout to treat agent as lost in seconds
  --config-dir DIR      Path to a config directory to pull *.conf files from.
                        This file set is sorted, so as to provide a
                        predictable parse order if individual options are
--- a/etc/shaker.conf
+++ b/etc/shaker.conf
@ -129,9 +129,13 @@
 # value)
 #output = <None>

-# Timeout to treat agent as lost (integer value)
+# Timeout to treat agent as lost in seconds (integer value)
 #agent_loss_timeout = 60

+# How long to wait for agents to join in seconds (time between stack deployment
+# and start of scenario execution). (integer value)
+#agent_join_timeout = 600
+
 # Report template in Jinja format (string value)
 #report_template = shaker/resources/report_template.jinja2

--- a/shaker/agent/agent.py
+++ b/shaker/agent/agent.py
@ -110,6 +110,7 @@ def main():
            elif task['operation'] == 'configure':
                if 'polling_interval' in task:
                    polling_interval = task.get('polling_interval')
+                    send_reply(socket, agent_id, {})

            time.sleep(polling_interval)

--- a/shaker/engine/config.py
+++ b/shaker/engine/config.py
@ -81,7 +81,12 @@ SERVER_OPTS = [
                    'defaults to env[SHAKER_OUTPUT].'),
    cfg.IntOpt('agent-loss-timeout',
               default=60,
-               help='Timeout to treat agent as lost in seconds')
+               help='Timeout to treat agent as lost in seconds'),
+    cfg.IntOpt('agent-join-timeout',
+               default=600,
+               help='How long to wait for agents to join in seconds (time '
+                    'between stack deployment and start of scenario '
+                    'execution).')
 ]

 REPORT_OPTS = [
--- a/shaker/engine/server.py
+++ b/shaker/engine/server.py
@ -34,94 +34,138 @@ from shaker.engine import utils
 LOG = logging.getLogger(__name__)


+class BaseOperation(object):
+    def get_agent_join_timeout(self):
+        return 0
+
+    def get_active_agent_ids(self):
+        pass
+
+    def get_reply(self, agent_id, start_at):
+        return {}
+
+    def process_reply(self, agent_id, message):
+        return {'status': 'ok'}
+
+    def process_failure(self, agent_id):
+        return {'status': 'lost'}
+
+
+class JoinOperation(BaseOperation):
+    def __init__(self, agent_ids, polling_interval, agent_join_timeout):
+        super(JoinOperation, self).__init__()
+        self.agent_ids = agent_ids
+        self.polling_interval = polling_interval
+        self.agent_join_timeout = agent_join_timeout
+
+    def get_agent_join_timeout(self):
+        return self.agent_join_timeout
+
+    def get_active_agent_ids(self):
+        return set(self.agent_ids)
+
+    def get_reply(self, agent_id, start_at):
+        return dict(operation='configure',
+                    polling_interval=self.polling_interval,
+                    expected_duration=0)
+
+
+class ExecuteOperation(BaseOperation):
+    def __init__(self, executors):
+        super(ExecuteOperation, self).__init__()
+        self.executors = executors
+
+    def get_active_agent_ids(self):
+        return set(self.executors.keys())
+
+    def get_reply(self, agent_id, start_at):
+        reply = dict(operation='execute',
+                     start_at=start_at,
+                     command=self.executors[agent_id].get_command(),
+                     expected_duration=(self.executors[agent_id].
+                                        get_expected_duration()))
+        return reply
+
+    def process_reply(self, agent_id, message):
+        r = super(ExecuteOperation, self).process_reply(agent_id, message)
+        r.update(self.executors[agent_id].process_reply(message))
+        return r
+
+    def process_failure(self, agent_id):
+        r = super(ExecuteOperation, self).process_failure(agent_id)
+        r.update(self.executors[agent_id].process_failure())
+        return r
+
+
 class Quorum(object):
-    def __init__(self, message_queue, polling_interval, agent_loss_timeout):
+    def __init__(self, message_queue, polling_interval, agent_loss_timeout,
+                 agent_join_timeout):
        self.message_queue = message_queue
        self.polling_interval = polling_interval
        self.agent_loss_timeout = agent_loss_timeout
+        self.agent_join_timeout = agent_join_timeout

-    def wait_join(self, agent_ids):
-        agent_ids = set(agent_ids)
-        LOG.debug('Waiting for quorum of agents: %s', agent_ids)
-        alive_agents = set()
-        for message, reply_handler in self.message_queue:
-            msg_agent_id = message.get('agent_id')
-
-            if msg_agent_id not in agent_ids:
-                reply_handler(dict(operation='none'))
-                continue
-
-            alive_agents.add(msg_agent_id)
-
-            reply_handler(dict(operation='configure',
-                               polling_interval=self.polling_interval))
-
-            LOG.debug('Alive agents: %s', alive_agents)
-
-            if alive_agents == agent_ids:
-                LOG.info('All expected agents are alive')
-                break
-
-    def run_test_case(self, test_case):
-        current = set(test_case.keys())
-        LOG.debug('Running test case on agents: %s', current)
+    def _run(self, operation):
+        current = operation.get_active_agent_ids()
+        LOG.debug('Executing operation %s on agents: %s', operation, current)

        working = set()
        replied = set()
        result = {}

        start_at = time.time() + self.polling_interval * 2
-        lives = dict((agent_id, start_at) for agent_id in current)
+        lives = dict((agent_id, start_at + operation.get_agent_join_timeout())
+                     for agent_id in current)

        for message, reply_handler in self.message_queue:
            agent_id = message.get('agent_id')
-            operation = message.get('operation')
-
-            now = time.time()
-            lives[agent_id] = (now + self.polling_interval * 2 +
-                               self.agent_loss_timeout)
-
+            op = message.get('operation')
            reply = {'operation': 'none'}
+            now = time.time()

-            if agent_id in current:
-                # message from a known agent
-                test = test_case[agent_id]
+            if agent_id in (current - replied):
+                # message from a known not yet worked agent
+                lives[agent_id] = (now + self.polling_interval * 2 +
+                                   self.agent_loss_timeout)

-                if operation == 'poll':
-                    reply = {
-                        'operation': 'execute',
-                        'start_at': start_at,
-                        'command': test.get_command(),
-                    }
+                if op == 'poll':
+                    reply = operation.get_reply(agent_id, start_at)
+                    lives[agent_id] += reply.get('expected_duration')
                    working.add(agent_id)
-                    lives[agent_id] += test.get_expected_duration()
                    LOG.debug('Working agents: %s', working)
-                elif operation == 'reply':
-                    replied.add(agent_id)
-                    result[agent_id] = test.process_reply(message)
-                    result[agent_id].update(dict(status='ok', time=now))
-                    LOG.debug('Replied agents: %s', replied)
+                elif op == 'reply':
+                    if agent_id in working:
+                        result[agent_id] = operation.process_reply(
+                            agent_id, message)
+                        replied.add(agent_id)
+                        LOG.debug('Replied agents: %s', replied)

            reply_handler(reply)

-            lost = set(a for a, t in lives.items() if t < now)
+            lost = set(a for a, t in lives.items() if t < now) - replied
            if lost:
                LOG.debug('Lost agents: %s', lost)

            if replied | lost >= current:
-                # update result with info about lost agents
-                for agent_id in lost:
-                    if agent_id not in replied and agent_id in current:
-                        result[agent_id] = (
-                            test_case[agent_id].process_failure())
-                        result[agent_id].update(dict(status='lost', time=now))
+                if lost:
+                    LOG.warning('Lost agents: %s', lost)
+                    # update result with info about lost agents
+                    for agent_id in lost:
+                        result[agent_id] = operation.process_failure(agent_id)

-                LOG.info('Received replies from all alive agents for '
-                         'test case: %s', test_case)
+                LOG.info('Finished processing operation: %s', operation)
                break

        return result

+    def join(self, agent_ids):
+        LOG.debug('Waiting for quorum of agents: %s', agent_ids)
+        return self._run(JoinOperation(agent_ids, self.polling_interval,
+                                       self.agent_join_timeout))
+
+    def execute(self, executors):
+        return self._run(ExecuteOperation(executors))
+

 def read_scenario():
    scenario_raw = utils.read_file(cfg.CONF.scenario)
@ -176,8 +220,9 @@ def execute(quorum, execution, agents):
            executors = dict((a['id'], executors_classes.get_executor(test, a))
                             for a in selected_agents)

-            test_case_result = quorum.run_test_case(executors)
-            values = test_case_result.values()
+            execution_result = quorum.execute(executors)
+
+            values = execution_result.values()
            for v in values:
                v['uuid'] = str(uuid.uuid4())
            results_per_iteration.append({
@ -228,8 +273,9 @@ def main():
            message_queue = messaging.MessageQueue(cfg.CONF.server_endpoint)

            quorum = Quorum(message_queue, cfg.CONF.polling_interval,
-                            cfg.CONF.agent_loss_timeout)
-            quorum.wait_join(set(agents.keys()))
+                            cfg.CONF.agent_loss_timeout,
+                            cfg.CONF.agent_join_timeout)
+            quorum.join(set(agents.keys()))

            result = execute(quorum, scenario['execution'], agents)
            LOG.debug('Result: %s', result)
--- a/shaker/lib.py
+++ b/shaker/lib.py
@ -33,7 +33,7 @@ class Shaker(object):
        message_queue = messaging.MessageQueue(server_endpoint)
        self.quorum = server.Quorum(message_queue, polling_interval,
                                    agent_loss_timeout)
-        self.quorum.wait_join(agent_ids)
+        self.quorum.join(agent_ids)

    def _run(self, agent_id, item):
        agents = dict([(agent_id, dict(id=agent_id, mode='alone'))])
--- a/tests/test_quorum.py
+++ b/tests/test_quorum.py
@ -24,9 +24,11 @@ from shaker.engine import server

 STEP = 10  # polling interval
 LOSS_TIMEOUT = 60
+JOIN_TIMEOUT = 600

 make_quorum = functools.partial(server.Quorum, polling_interval=STEP,
-                                agent_loss_timeout=LOSS_TIMEOUT)
+                                agent_loss_timeout=LOSS_TIMEOUT,
+                                agent_join_timeout=JOIN_TIMEOUT)


 class DummyExecutor(base_executor.BaseExecutor):
@ -74,7 +76,7 @@ class TestQuorum(testtools.TestCase):
        event_stream = [
            dict(msg=dict(operation='poll', agent_id='alpha'),
                 reply=dict(operation='execute', command='RUN',
-                            start_at=STEP * 2),
+                            start_at=STEP * 2, expected_duration=STEP),
                 time=1),
            dict(msg=dict(operation='reply', agent_id='alpha'),
                 reply=dict(operation='none'),
@ -85,7 +87,7 @@ class TestQuorum(testtools.TestCase):
        test_case = {
            'alpha': DummyExecutor()
        }
-        result = quorum.run_test_case(test_case)
+        result = quorum.execute(test_case)
        self.assertEqual(result.keys(), test_case.keys())

    def test_poll_reply_unknown_agent_ignored(self):
@ -93,7 +95,7 @@ class TestQuorum(testtools.TestCase):
        event_stream = [
            dict(msg=dict(operation='poll', agent_id='alpha'),
                 reply=dict(operation='execute', command='RUN',
-                            start_at=STEP * 2),
+                            start_at=STEP * 2, expected_duration=STEP),
                 time=1),
            dict(msg=dict(operation='reply', agent_id='beta'),
                 reply=dict(operation='none'),
@ -107,7 +109,7 @@ class TestQuorum(testtools.TestCase):
        test_case = {
            'alpha': DummyExecutor()
        }
-        result = quorum.run_test_case(test_case)
+        result = quorum.execute(test_case)
        self.assertEqual(result.keys(), test_case.keys())

    def test_lost_agent(self):
@ -115,7 +117,7 @@ class TestQuorum(testtools.TestCase):
        event_stream = [
            dict(msg=dict(operation='poll', agent_id='alpha'),
                 reply=dict(operation='execute', command='RUN',
-                            start_at=STEP * 2),
+                            start_at=STEP * 2, expected_duration=STEP),
                 time=1),
            dict(msg=dict(operation='reply', agent_id='heartbeat'),
                 reply=dict(operation='none'),
@ -126,7 +128,7 @@ class TestQuorum(testtools.TestCase):
        test_case = {
            'alpha': DummyExecutor()
        }
-        result = quorum.run_test_case(test_case)
+        result = quorum.execute(test_case)
        self.assertEqual(result.keys(), test_case.keys())
        self.assertEqual('lost', result['alpha']['status'])

@ -136,7 +138,7 @@ class TestQuorum(testtools.TestCase):
        event_stream = [
            dict(msg=dict(operation='poll', agent_id='alpha'),
                 reply=dict(operation='execute', command='RUN',
-                            start_at=STEP * 2),
+                            start_at=STEP * 2, expected_duration=STEP),
                 time=1),
            dict(msg=dict(operation='reply', agent_id='heartbeat'),
                 reply=dict(operation='none'),
@ -150,7 +152,7 @@ class TestQuorum(testtools.TestCase):
        test_case = {
            'alpha': DummyExecutor()
        }
-        result = quorum.run_test_case(test_case)
+        result = quorum.execute(test_case)
        self.assertEqual(result.keys(), test_case.keys())
        self.assertEqual('ok', result['alpha']['status'])

@ -159,11 +161,11 @@ class TestQuorum(testtools.TestCase):
        event_stream = [
            dict(msg=dict(operation='poll', agent_id='alpha'),
                 reply=dict(operation='execute', command='RUN',
-                            start_at=STEP * 2),
+                            start_at=STEP * 2, expected_duration=STEP),
                 time=1),
            dict(msg=dict(operation='poll', agent_id='beta'),
                 reply=dict(operation='execute', command='RUN',
-                            start_at=STEP * 2),
+                            start_at=STEP * 2, expected_duration=STEP),
                 time=2),
            dict(msg=dict(operation='reply', agent_id='beta'),
                 reply=dict(operation='none'),
@ -178,17 +180,17 @@ class TestQuorum(testtools.TestCase):
            'alpha': DummyExecutor(),
            'beta': DummyExecutor(),
        }
-        result = quorum.run_test_case(test_case)
+        result = quorum.execute(test_case)
        self.assertEqual(result.keys(), test_case.keys())
        self.assertEqual('lost', result['alpha']['status'])
        self.assertEqual('ok', result['beta']['status'])

-    def test_wait_agent_running_long_test(self):
+    def test_wait_agentexecutening_long_test(self):
        self.mock_time.return_value = 0
        event_stream = [
            dict(msg=dict(operation='poll', agent_id='alpha'),
                 reply=dict(operation='execute', command='RUN',
-                            start_at=STEP * 2),
+                            start_at=STEP * 2, expected_duration=STEP * 9),
                 time=1),
            dict(msg=dict(operation='reply', agent_id='heartbeat'),
                 reply=dict(operation='none'),
@ -202,6 +204,45 @@ class TestQuorum(testtools.TestCase):
        test_case = {
            'alpha': DummyExecutor(duration=STEP * 9)
        }
-        result = quorum.run_test_case(test_case)
+        result = quorum.execute(test_case)
        self.assertEqual(result.keys(), test_case.keys())
        self.assertEqual('ok', result['alpha']['status'])
+
+    def test_join_succeed(self):
+        self.mock_time.return_value = 0
+        event_stream = [
+            dict(msg=dict(operation='poll', agent_id='alpha'),
+                 reply=dict(operation='configure', polling_interval=STEP,
+                            expected_duration=0),
+                 time=STEP),
+            dict(msg=dict(operation='reply', agent_id='alpha'),
+                 reply=dict(operation='none'),
+                 time=STEP * 2),
+            dict(msg=dict(operation='reply', agent_id='heartbeat'),
+                 reply=dict(operation='none'),
+                 time=STEP * 2),
+        ]
+
+        quorum = make_quorum(self._message_queue_gen(event_stream))
+        result = quorum.join(['alpha'])
+        lost = [agent_id for agent_id, r in result.items()
+                if r['status'] == 'lost']
+        self.assertEqual([], lost)
+
+    def test_join_failed(self):
+        self.mock_time.return_value = 0
+        event_stream = [
+            dict(msg=dict(operation='poll', agent_id='alpha'),
+                 reply=dict(operation='configure', polling_interval=STEP,
+                            expected_duration=0),
+                 time=STEP),
+            dict(msg=dict(operation='reply', agent_id='heartbeat'),
+                 reply=dict(operation='none'),
+                 time=JOIN_TIMEOUT + STEP * 2),
+        ]
+
+        quorum = make_quorum(self._message_queue_gen(event_stream))
+        result = quorum.join(['alpha'])
+        lost = [agent_id for agent_id, r in result.items()
+                if r['status'] == 'lost']
+        self.assertEqual(['alpha'], lost)