# Copyright 2013 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

from proboscis import test

from fuelweb_test.helpers.decorators import log_snapshot_after_test
from fuelweb_test.tests.base_test_case import SetupEnvironment
from fuelweb_test.tests.tests_strength.test_failover_base\
    import TestHaFailoverBase


@test(groups=["ha", "neutron_failover", "ha_neutron_destructive"])
class TestHaNeutronFailover(TestHaFailoverBase):
    """TestHaNeutronFailover."""  # TODO documentation
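
    # Note: snapshot_name below overrides the base-class property. Per the
    # related fix for bug #1556791, the base class no longer supplies a
    # default and instead raises ValueError, so every child class must
    # override it. Assumed base-class pattern (illustrative sketch only, not
    # the actual TestHaFailoverBase code):
    #
    #     @property
    #     def snapshot_name(self):
    #         raise ValueError("Snapshot name must be defined in child class")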

    @property
    def snapshot_name(self):
        return "prepare_ha_neutron"

    @test(depends_on=[SetupEnvironment.prepare_slaves_5],
          groups=["deploy_ha", "prepare_ha_neutron", "neutron", "deployment"])
    @log_snapshot_after_test
    def prepare_ha_neutron(self):
        """Prepare cluster in HA/Neutron mode for failover tests

        Scenario:
            1. Create cluster
            2. Add 3 nodes with controller roles
            3. Add 2 nodes with compute roles
            4. Deploy the cluster
            5. Make snapshot

        Duration 70m
        Snapshot prepare_ha_neutron
        """
        super(self.__class__, self).deploy_ha()

    @test(depends_on_groups=['prepare_ha_neutron'],
          groups=["ha_neutron_destroy_controllers", "ha_destroy_controllers"])
    @log_snapshot_after_test
    def ha_neutron_destroy_controllers(self):
        """Destroy two controllers and check pacemaker status is correct

        Scenario:
            1. Revert environment
            2. Destroy first controller
            3. Check pacemaker status
            4. Run OSTF
            5. Revert environment
            6. Destroy second controller
            7. Check pacemaker status
            8. Run OSTF

        Duration 35m
        """
        super(self.__class__, self).ha_destroy_controllers()
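
    # Illustrative only: "block traffic on br-mgmt" in the disabled test
    # below is assumed to mean dropping all traffic on the controller's
    # management bridge, e.g. with rules along the lines of:
    #     iptables -I INPUT -i br-mgmt -j DROP
    #     iptables -I OUTPUT -o br-mgmt -j DROP
    # The actual implementation lives in TestHaFailoverBase.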

    @test(enabled=False,
          depends_on_groups=['prepare_ha_neutron'],
          groups=["ha_neutron_disconnect_controllers",
                  "ha_disconnect_controllers"])
    @log_snapshot_after_test
    def ha_neutron_disconnect_controllers(self):
        """Disconnect controllers and check pacemaker status is correct

        Scenario:
            1. Block traffic on br-mgmt of the first controller
            2. Check pacemaker status
            3. Wait until MySQL Galera is UP on some controller
            4. Run OSTF

        Duration 15m
        """
        super(self.__class__, self).ha_disconnect_controllers()
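
    # Illustrative only: "deleting" a VIP in the test below can be thought of
    # as removing the address from the interface of the controller currently
    # holding it, e.g. (assumed): ip addr del <vip>/<prefix> dev <iface>.
    # Pacemaker is then expected to restore the VIP resource.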

    @test(depends_on_groups=['prepare_ha_neutron'],
          groups=["ha_neutron_delete_vips", "ha_delete_vips"])
    @log_snapshot_after_test
    def ha_neutron_delete_vips(self):
        """Delete management and public VIPs 10 times.

        Verify that they are restored.
        Verify cluster with OSTF.

        Scenario:
            1. Delete public and management VIPs 10 times
            2. Wait while they are being restored
            3. Verify they are restored
            4. Run OSTF

        Duration 30m
        """
        super(self.__class__, self).ha_delete_vips()

    @test(depends_on_groups=['prepare_ha_neutron'],
          groups=["ha_neutron_mysql_termination", "ha_mysql_termination"])
    @log_snapshot_after_test
    def ha_neutron_mysql_termination(self):
        """Terminate mysql on all controllers one by one

        Scenario:
            1. Terminate mysql
            2. Wait while it is being restarted
            3. Verify it is restarted
            4. Go to another controller
            5. Run OSTF

        Duration 15m
        """
        super(self.__class__, self).ha_mysql_termination()

    @test(depends_on_groups=['prepare_ha_neutron'],
          groups=["ha_neutron_haproxy_termination", "ha_haproxy_termination"])
    @log_snapshot_after_test
    def ha_neutron_haproxy_termination(self):
        """Terminate haproxy on all controllers one by one

        Scenario:
            1. Terminate haproxy
            2. Wait while it is being restarted
            3. Verify it is restarted
            4. Go to another controller
            5. Run OSTF

        Duration 25m
        """
        super(self.__class__, self).ha_haproxy_termination()

    @test(depends_on_groups=['prepare_ha_neutron'],
          groups=["ha_neutron_pacemaker_configuration",
                  "ha_pacemaker_configuration"])
    @log_snapshot_after_test
    def ha_neutron_pacemaker_configuration(self):
        """Verify resources are configured

        Scenario:
            1. SSH to controller node
            2. Verify resources are configured
            3. Go to next controller

        Duration 15m
        """
        super(self.__class__, self).ha_pacemaker_configuration()
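
    # Illustrative only: "block heat-engine amqp connections" in the disabled
    # test below is assumed to mean dropping traffic to the AMQP port from
    # the controller, e.g. (assumed, the port depends on the deployment):
    #     iptables -I OUTPUT -p tcp --dport <amqp port> -j DROP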

    @test(enabled=False, depends_on_groups=['prepare_ha_neutron'],
          groups=["ha_neutron_pacemaker_restart_heat_engine",
                  "ha_pacemaker_restart_heat_engine"])
    @log_snapshot_after_test
    def ha_neutron_pacemaker_restart_heat_engine(self):
        """Verify heat engine service is restarted
        by pacemaker on amqp connection loss

        Scenario:
            1. SSH to any controller
            2. Check heat-engine status
            3. Block heat-engine amqp connections
            4. Check heat-engine was stopped on current controller
            5. Unblock heat-engine amqp connections
            6. Check heat-engine process is running with new pid
            7. Check amqp connection re-appears for heat-engine

        Duration 15m
        """
        super(self.__class__, self).ha_pacemaker_restart_heat_engine()

    @test(enabled=False, depends_on_groups=['prepare_ha_neutron'],
          groups=["ha_neutron_check_monit", "ha_check_monit"])
    @log_snapshot_after_test
    def ha_neutron_check_monit(self):
        """Verify that monit restarts the nova-compute
        service if it is killed

        Scenario:
            1. SSH to every compute node in cluster
            2. Kill nova-compute service
            3. Check service is restarted by monit

        Duration 25m
        """
        super(self.__class__, self).ha_check_monit()
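
    # Illustrative only: the firewall check below is assumed to boil down to
    # commands like these (the real logic lives in the base class):
    #     cluster node:  socat TCP-LISTEN:<free port> STDOUT
    #     admin node:    echo '<test string>' | nc <node ip> <free port>
    # Whether the string reaches the listener shows whether the random port
    # is filtered by the node firewall.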

    @test(depends_on_groups=['prepare_ha_neutron'],
          groups=["ha_neutron_firewall"])
    @log_snapshot_after_test
    def ha_neutron_firewall(self):
        """Check firewall vulnerability on Neutron network

        Scenario:
            1. Start 'socat' on a cluster node to listen on a free random port
            2. Send a string to this port with 'nc' from the admin node
            3. Check whether the string appeared on the cluster node
            4. Repeat for each cluster node

        Duration 25m
        """
        super(self.__class__, self).check_firewall_vulnerability()

    @test(depends_on_groups=['prepare_ha_neutron'],
          groups=["ha_neutron_virtual_router"])
    @log_snapshot_after_test
    def ha_neutron_virtual_router(self):
        """Verify that the connection persists and downloading
        is maintained by conntrackd after the primary controller
        is destroyed

        Scenario:
            1. SSH to compute node
            2. Check Internet connectivity
            3. Destroy primary controller
            4. Check Internet connectivity

        Duration 25m
        """
        super(self.__class__, self).check_virtual_router()
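
    # Illustrative only: the 5% packet loss in the disabled test below is
    # assumed to be emulated with netem on the management bridge, e.g.:
    #     tc qdisc add dev br-mgmt root netem loss 5%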

    @test(enabled=False, depends_on_groups=['prepare_ha_neutron'],
          groups=["check_neutron_package_loss"])
    @log_snapshot_after_test
    def ha_neutron_packages_loss(self):
        """Check cluster recovery when br-mgmt loses 5% of packets

        Scenario:
            1. SSH to controller
            2. Set 5% packet loss on br-mgmt
            3. Run OSTF

        Duration
        """
        # TODO: enable this test when fencing is implemented
        super(self.__class__, self).ha_controller_loss_packages()
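
    # Illustrative only: in the two checks below, "rabbitmq cluster_status"
    # refers to the output of `rabbitmqctl cluster_status` on a controller
    # (assumed), and rabbit-fence.log is inspected on the Fuel master node.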

    @test(depends_on_groups=['prepare_ha_neutron'],
          groups=["ha_neutron_check_alive_rabbit"])
    @log_snapshot_after_test
    def ha_neutron_check_alive_rabbit(self):
        """Check that an alive rabbit node is not kicked from the cluster
        when the corosync service on the node dies

        Scenario:
            1. SSH to first controller and put corosync cluster to
               maintenance mode:
               crm configure property maintenance-mode=true
            2. Stop corosync service on first controller
            3. Check on master node that rabbit-fence.log contains
               "Ignoring alive node rabbit@node-1"
            4. On second controller check that rabbitmq cluster_status
               contains all 3 nodes
            5. On first controller start corosync service and restart
               pacemaker
            6. Check that pcs status contains all 3 nodes

        Duration 25m
        """
        super(self.__class__, self).check_alive_rabbit_node_not_kicked()

    @test(depends_on_groups=['prepare_ha_neutron'],
          groups=["ha_neutron_check_dead_rabbit"])
    @log_snapshot_after_test
    def ha_neutron_check_dead_rabbit(self):
        """Check that a dead rabbit node is kicked from the cluster
        when the corosync service on the node dies

        Scenario:
            1. SSH to first controller and put corosync cluster to
               maintenance mode:
               crm configure property maintenance-mode=true
            2. Stop rabbit and corosync services on first controller
            3. Check on master node that rabbit-fence.log contains
               "Disconnecting rabbit@node-1"
            4. On second controller check that rabbitmq cluster_status
               contains only 2 nodes

        Duration 25m
        """
        super(self.__class__, self).check_dead_rabbit_node_kicked()

    @test(depends_on_groups=['prepare_ha_neutron'],
          groups=["ha_neutron_3_1_rabbit_failover"])
    @log_snapshot_after_test
    def ha_neutron_test_3_1_rabbit_failover(self):
        """Check 3-in-1 rabbit failover

        Scenario:
            1. SSH to controller and get rabbit master
            2. Destroy a node that is not the rabbit master
            3. Check that the rabbit master stays the same
            4. Run OSTF HA
            5. Power on the destroyed slave
            6. Check the rabbit master is the same
            7. Run OSTF HA
            8. Destroy the rabbit master node
            9. Check that a new rabbit master appears
            10. Run OSTF HA
            11. Power on the destroyed node
            12. Check that a new rabbit master was not elected
            13. Run OSTF HA

        Duration 25m
        """
        super(self.__class__, self).test_3_1_rabbit_failover()

    @test(depends_on_groups=['prepare_ha_neutron'],
          groups=["ha_corosync_stability_check", "ha"])
    @log_snapshot_after_test
    def ha_corosync_stability_check(self):
        """Check that after a corosync failover "pcs status nodes" reports
        the same DC with quorum and correct "offline"/"online" statuses

        Scenario:
            1. On the first controller kill corosync
            2. Verify that "pcs status nodes" reports correctly
               on all controllers
            3. Start corosync on the first controller
            4. Repeat steps 1-3 500 times

        Duration 120m
        """
        super(self.__class__, self).ha_corosync_stability_check()

    @test(depends_on_groups=['prepare_ha_neutron'],
          groups=['change_pacemaker_parameter_does_not_break_rabbitmq'])
    @log_snapshot_after_test
    def change_pacemaker_parameter_not_break_rabbitmq(self):
        """Changing a pacemaker parameter does not break RabbitMQ.

        Scenario:
            1. Deploy environment with at least 3 controllers
            2. Change the max_rabbitmqctl_timeouts parameter on one of the
               controllers; after that the rabbitmq slaves will be restarted
               by Pacemaker
            3. Wait for 1 minute
            4. Check that the RabbitMQ cluster is assembled, waiting up to
               10 minutes for success
            5. Run OSTF
            6. Repeat steps 2-5 two more times

        Duration 50 min
        """
        super(self.__class__, self).\
            change_pacemaker_parameter_not_break_rabbitmq()

    @test(depends_on_groups=['prepare_ha_neutron'],
          groups=['ha_rabbitmq_stability_check'])
    @log_snapshot_after_test
    def ha_rabbitmq_stability_check(self):
        """Check that the rabbit cluster is healthy after repeated failover.

        Scenario:
            1. Deploy environment with at least 3 controllers
               (or revert existing snapshot)
            2. Wait for mysql cluster to become active
            3. Run OSTF tests before destructive actions
            4. Get rabbit master node
            5. Move master rabbit resource to a slave with pcs
            6. Delete pcs constraint for rabbit resource
            7. Assert HA services ready
            8. Get new rabbit master node
            9. Destroy it
            10. Assert HA services ready
            11. Run sanity and smoke OSTF sets
            12. Power on destroyed node
            13. Assert HA services ready
            14. Assert OS services ready
            15. Run OSTF

        Duration 80 min
        """
        super(self.__class__, self).ha_rabbitmq_stability_check()