diff --git a/etc/libra.cfg b/etc/libra.cfg
index e275823b..1ad24e18 100644
--- a/etc/libra.cfg
+++ b/etc/libra.cfg
@@ -140,6 +140,12 @@ pid = /var/run/libra/libra_admin_api.pid
 #stats_enable = False
 #stats_purge_enable = False
 
+# These are the number of instances to save for forensic analysis on failure
+# A value of 0 disables saving any instances
+# Failed devices will have a status of "SAVED-OFFLINE" and "SAVED-ONLINE"
+#offline_failed_save = 0
+#online_failed_save = 0
+
 # The following are the seconds of each minute
 # that the timers will run. The defaults should
 # not need to be changed..
diff --git a/libra/admin_api/__init__.py b/libra/admin_api/__init__.py
index bdf64fe0..2addf2fe 100644
--- a/libra/admin_api/__init__.py
+++ b/libra/admin_api/__init__.py
@@ -139,6 +139,14 @@ cfg.CONF.register_opts(
         cfg.IntOpt('exists_timer_seconds',
                    default=55,
                    help='Second of each minute exists timer should run'),
-    ],
+        cfg.IntOpt('offline_failed_save',
+                   default=0,
+                   help='Number of failed offline instances to save '
+                        'for forensic analysis'),
+        cfg.IntOpt('online_failed_save',
+                   default=0,
+                   help='Number of failed online instances to save '
+                        'for forensic analysis'),
+    ],
     group=adminapi_group
 )
diff --git a/libra/admin_api/library/rebuild.py b/libra/admin_api/library/rebuild.py
index c86ea7d4..4ddb71d6 100644
--- a/libra/admin_api/library/rebuild.py
+++ b/libra/admin_api/library/rebuild.py
@@ -17,6 +17,7 @@ from libra.common.api.lbaas import loadbalancers_devices, Vip, Counters
 from libra.common.api.lbaas import Device, LoadBalancer, db_session
 from libra.common.api.gearman_client import submit_job, submit_vip_job
 from libra.openstack.common import log
+from oslo.config import cfg
 
 LOG = log.getLogger(__name__)
 
@@ -25,6 +26,7 @@ LOG = log.getLogger(__name__)
 def rebuild_device(device_id):
     new_device_id = None
     new_device_name = None
+    ONLINE_FAILED_SAVE = cfg.CONF['admin_api'].online_failed_save
     with db_session() as session:
         new_device = session.query(Device).\
             filter(~Device.id.in_(
@@ -72,9 +74,14 @@ def rebuild_device(device_id):
         vip = session.query(Vip).filter(Vip.device == device_id).first()
         if vip:
             vip.device = new_device_id
+        saved_count = session.query(Device).\
+            filter(Device.status == 'SAVED-ONLINE').count()
         device = session.query(Device).\
             filter(Device.id == device_id).first()
-        device.status = 'DELETED'
+        if ONLINE_FAILED_SAVE > 0 and saved_count < ONLINE_FAILED_SAVE:
+            device.status = 'SAVED-ONLINE'
+        else:
+            device.status = 'DELETED'
         lbs = session.query(LoadBalancer).\
             join(LoadBalancer.devices).\
             filter(Device.id == new_device_id).all()
diff --git a/libra/admin_api/stats/drivers/database/driver.py b/libra/admin_api/stats/drivers/database/driver.py
index a6a7272e..c0039e77 100644
--- a/libra/admin_api/stats/drivers/database/driver.py
+++ b/libra/admin_api/stats/drivers/database/driver.py
@@ -16,6 +16,7 @@ from libra.common.api.lbaas import Device, LoadBalancer, db_session
 from libra.common.api.lbaas import loadbalancers_devices
 from libra.admin_api.library.rebuild import rebuild_device
 from libra.openstack.common import log
+from oslo.config import cfg
 
 LOG = log.getLogger(__name__)
 
@@ -50,10 +51,19 @@ class DbDriver(AlertDriver):
         self._rebuild_device(device_id)
 
     def send_delete(self, message, device_id, device_ip, device_name):
+        OFFLINE_FAILED_SAVE = cfg.CONF['admin_api'].offline_failed_save
         with db_session() as session:
-            session.query(Device).\
-                filter(Device.id == device_id).\
-                update({"status": "DELETED"}, synchronize_session='fetch')
+            saved_count = session.query(Device).\
+                filter(Device.status == 'SAVED-OFFLINE').count()
+            if OFFLINE_FAILED_SAVE > 0 and saved_count < OFFLINE_FAILED_SAVE:
+                session.query(Device).\
+                    filter(Device.id == device_id).\
+                    update({"status": "SAVED-OFFLINE"},\
+                           synchronize_session='fetch')
+            else:
+                session.query(Device).\
+                    filter(Device.id == device_id).\
+                    update({"status": "DELETED"}, synchronize_session='fetch')
             session.commit()
 
     def send_node_change(self, message, lbid, degraded):