From 4d3fd33614cf94f39b9729da7c2d75c355c0712b Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Mon, 16 Sep 2013 11:23:35 -0400 Subject: [PATCH] [ALL] Add Gearman KEEPALIVE options This requires the latest development version of python-gearman to support the new keepalive options. Using these options (off by default) will solve the problem when the Gearman job server gets paused/unpaused in a cloud environment, but the clients and workers never recognize that they are indeed disconnected. WARNING: Not backwards compatible with older versions of the python-gearman driver, even if the SSL and keepalive options are not specified. This is a change in behavior. Change-Id: Ic8db6676f7408364b6fe9a8690deb72bb6e2772c --- doc/admin_api/config.rst | 19 +++++++++++++++++++ doc/api/config.rst | 21 ++++++++++++++++++++- doc/pool_mgm/config.rst | 19 +++++++++++++++++++ doc/worker/config.rst | 23 +++++++++++++++++++++-- etc/sample_libra.cfg | 3 +-- libra/admin_api/app.py | 22 +++++++++++++++++++++- libra/api/app.py | 22 +++++++++++++++++++++- libra/common/api/gearman_client.py | 27 +++++++++++++-------------- libra/mgm/gearman_worker.py | 25 +++++++++++++------------ libra/mgm/mgm.py | 19 +++++++++++++++++-- libra/worker/main.py | 26 ++++++++++++++++++++------ libra/worker/worker.py | 25 +++++++++++++------------ 12 files changed, 198 insertions(+), 53 deletions(-) diff --git a/doc/admin_api/config.rst b/doc/admin_api/config.rst index e12b6301..7aa6b2d2 100644 --- a/doc/admin_api/config.rst +++ b/doc/admin_api/config.rst @@ -55,6 +55,25 @@ Command Line Options The path for the SSL key file to be used for the frontend of the API server + .. option:: --gearman_keepalive + + Use TCP KEEPALIVE to the Gearman job server. Not supported on all + systems. + + .. option:: --gearman_keepcnt + + Maximum number of TCP KEEPALIVE probes to send before killing the + connection to the Gearman job server. + + .. 
option:: --gearman_keepidle + + Seconds of idle time on the Gearman job server connection before + sending TCP KEEPALIVE probes. + + .. option:: --gearman_keepintvl + + Seconds between TCP KEEPALIVE probes. + .. option:: --gearman_ssl_ca The path for the Gearman SSL Certificate Authority. diff --git a/doc/api/config.rst b/doc/api/config.rst index 18b06775..bdaba0bc 100644 --- a/doc/api/config.rst +++ b/doc/api/config.rst @@ -60,6 +60,25 @@ Command Line Options Used to specify the Gearman job server hostname and port. This option can be used multiple times to specify multiple job servers. + .. option:: --gearman_keepalive + + Use TCP KEEPALIVE to the Gearman job server. Not supported on all + systems. + + .. option:: --gearman_keepcnt + + Maximum number of TCP KEEPALIVE probes to send before killing the + connection to the Gearman job server. + + .. option:: --gearman_keepidle + + Seconds of idle time on the Gearman job server connection before + sending TCP KEEPALIVE probes. + + .. option:: --gearman_keepintvl + + Seconds between TCP KEEPALIVE probes. + .. option:: --gearman_ssl_ca The path for the Gearman SSL Certificate Authority @@ -79,7 +98,7 @@ Command Line Options class. .. option:: --swift_basepath - + The default container to be used for customer log uploads. .. option:: --swift_endpoint diff --git a/doc/pool_mgm/config.rst b/doc/pool_mgm/config.rst index 095cd8f4..5f42de21 100644 --- a/doc/pool_mgm/config.rst +++ b/doc/pool_mgm/config.rst @@ -83,6 +83,25 @@ Command Line Options The flavor ID (image size ID) or name to use for new nodes spun up in the Nova API + .. option:: --gearman_keepalive + + Use TCP KEEPALIVE to the Gearman job server. Not supported on all + systems. + + .. option:: --gearman_keepcnt + + Maximum number of TCP KEEPALIVE probes to send before killing the + connection to the Gearman job server. + + .. option:: --gearman_keepidle + + Seconds of idle time on the Gearman job server connection before + sending TCP KEEPALIVE probes. + + .. 
option:: --gearman_keepintvl + + Seconds between TCP KEEPALIVE probes. + .. option:: --gearman_ssl_ca The path for the Gearman SSL Certificate Authority. diff --git a/doc/worker/config.rst b/doc/worker/config.rst index 7cb4d672..ffc57144 100644 --- a/doc/worker/config.rst +++ b/doc/worker/config.rst @@ -28,6 +28,25 @@ Command Line Options * *haproxy* - `HAProxy `_ software load balancer. This is the default driver. + .. option:: --gearman_keepalive + + Use TCP KEEPALIVE to the Gearman job server. Not supported on all + systems. + + .. option:: --gearman_keepcnt + + Maximum number of TCP KEEPALIVE probes to send before killing the + connection to the Gearman job server. + + .. option:: --gearman_keepidle + + Seconds of idle time on the Gearman job server connection before + sending TCP KEEPALIVE probes. + + .. option:: --gearman_keepintvl + + Seconds between TCP KEEPALIVE probes. + .. option:: --gearman_ssl_ca Full path to the file with the CA public key to use when @@ -61,9 +80,9 @@ Command Line Options .. option:: --gearman-poll - The number of seconds gearman will poll before re-shuffling its + The number of seconds gearman will poll before re-shuffling its connections. Default is 1 second. - + .. option:: --syslog Send log events to syslog. 
diff --git a/etc/sample_libra.cfg b/etc/sample_libra.cfg index c3bd5afc..1730d6b0 100644 --- a/etc/sample_libra.cfg +++ b/etc/sample_libra.cfg @@ -28,9 +28,8 @@ user = libra group = libra driver = haproxy reconnect_sleep = 60 -stats_poll = 300 gearman_poll = 60 -server = 10.0.0.1:8080 10.0.0.2:8080 +server = 10.0.0.1:4730 10.0.0.2:4730 pid = /var/run/libra/libra_worker.pid logfile = /var/log/libra/libra_worker.log diff --git a/libra/admin_api/app.py b/libra/admin_api/app.py index b2313f86..4e6eef17 100644 --- a/libra/admin_api/app.py +++ b/libra/admin_api/app.py @@ -51,7 +51,11 @@ def setup_app(pecan_config, args): 'server': args.gearman, 'ssl_key': args.gearman_ssl_key, 'ssl_cert': args.gearman_ssl_cert, - 'ssl_ca': args.gearman_ssl_ca + 'ssl_ca': args.gearman_ssl_ca, + 'keepalive': args.gearman_keepalive, + 'keepcnt': args.gearman_keepcnt, + 'keepidle': args.gearman_keepidle, + 'keepintvl': args.gearman_keepintvl } config['conffile'] = args.config if args.debug: @@ -136,6 +140,22 @@ def main(): '--gearman', action='append', metavar='HOST:PORT', default=[], help='Gearman job servers' ) + options.parser.add_argument( + '--gearman_keepalive', action="store_true", + help='use KEEPALIVE to Gearman server' + ) + options.parser.add_argument( + '--gearman_keepcnt', type=int, metavar='COUNT', + help='max keepalive probes to send before killing connection' + ) + options.parser.add_argument( + '--gearman_keepidle', type=int, metavar='SECONDS', + help='seconds of idle time before sending keepalive probes' + ) + options.parser.add_argument( + '--gearman_keepintvl', type=int, metavar='SECONDS', + help='seconds between TCP keepalive probes' + ) options.parser.add_argument( '--gearman_ssl_ca', metavar='FILE', help='Gearman SSL certificate authority' diff --git a/libra/api/app.py b/libra/api/app.py index 38e60a42..4c21ac45 100644 --- a/libra/api/app.py +++ b/libra/api/app.py @@ -56,7 +56,11 @@ def setup_app(pecan_config, args): 'server': args.gearman, 'ssl_key': 
args.gearman_ssl_key, 'ssl_cert': args.gearman_ssl_cert, - 'ssl_ca': args.gearman_ssl_ca + 'ssl_ca': args.gearman_ssl_ca, + 'keepalive': args.gearman_keepalive, + 'keepcnt': args.gearman_keepcnt, + 'keepidle': args.gearman_keepidle, + 'keepintvl': args.gearman_keepintvl } config['ip_filters'] = args.ip_filters if args.debug: @@ -115,6 +119,22 @@ def main(): '--gearman', action='append', metavar='HOST:PORT', default=[], help='Gearman job servers' ) + options.parser.add_argument( + '--gearman_keepalive', action="store_true", + help='use KEEPALIVE to Gearman server' + ) + options.parser.add_argument( + '--gearman_keepcnt', type=int, metavar='COUNT', + help='max keepalive probes to send before killing connection' + ) + options.parser.add_argument( + '--gearman_keepidle', type=int, metavar='SECONDS', + help='seconds of idle time before sending keepalive probes' + ) + options.parser.add_argument( + '--gearman_keepintvl', type=int, metavar='SECONDS', + help='seconds between TCP keepalive probes' + ) options.parser.add_argument( '--gearman_ssl_ca', metavar='FILE', help='Gearman SSL certificate authority' diff --git a/libra/common/api/gearman_client.py b/libra/common/api/gearman_client.py index 4f8dda7c..513e037f 100644 --- a/libra/common/api/gearman_client.py +++ b/libra/common/api/gearman_client.py @@ -84,20 +84,19 @@ class GearmanClientThread(object): self.host = host self.lbid = lbid - if all([conf.gearman.ssl_key, conf.gearman.ssl_cert, - conf.gearman.ssl_ca]): - # Use SSL connections to each Gearman job server. 
- ssl_server_list = [] - for server in conf.gearman.server: - ghost, gport = server.split(':') - ssl_server_list.append({'host': ghost, - 'port': int(gport), - 'keyfile': conf.gearman.ssl_key, - 'certfile': conf.gearman.ssl_cert, - 'ca_certs': conf.gearman.ssl_ca}) - self.gearman_client = JSONGearmanClient(ssl_server_list) - else: - self.gearman_client = JSONGearmanClient(conf.gearman.server) + server_list = [] + for server in conf.gearman.server: + ghost, gport = server.split(':') + server_list.append({'host': ghost, + 'port': int(gport), + 'keyfile': conf.gearman.ssl_key, + 'certfile': conf.gearman.ssl_cert, + 'ca_certs': conf.gearman.ssl_ca, + 'keepalive': conf.gearman.keepalive, + 'keepcnt': conf.gearman.keepcnt, + 'keepidle': conf.gearman.keepidle, + 'keepintvl': conf.gearman.keepintvl}) + self.gearman_client = JSONGearmanClient(server_list) def send_assign(self, data): job_data = { diff --git a/libra/mgm/gearman_worker.py b/libra/mgm/gearman_worker.py index 0a8b0449..e2f3ac56 100644 --- a/libra/mgm/gearman_worker.py +++ b/libra/mgm/gearman_worker.py @@ -34,18 +34,19 @@ def worker_thread(logger, args): logger.info("Registering task libra_pool_mgm") hostname = socket.gethostname() - if all([args.gearman_ssl_key, args.gearman_ssl_cert, args.gearman_ssl_ca]): - ssl_server_list = [] - for host_port in args.gearman: - host, port = host_port.split(':') - ssl_server_list.append({'host': host, - 'port': int(port), - 'keyfile': args.gearman_ssl_key, - 'certfile': args.gearman_ssl_cert, - 'ca_certs': args.gearman_ssl_ca}) - worker = JSONGearmanWorker(ssl_server_list) - else: - worker = JSONGearmanWorker(args.gearman) + server_list = [] + for host_port in args.gearman: + host, port = host_port.split(':') + server_list.append({'host': host, + 'port': int(port), + 'keyfile': args.gearman_ssl_key, + 'certfile': args.gearman_ssl_cert, + 'ca_certs': args.gearman_ssl_ca, + 'keepalive': args.gearman_keepalive, + 'keepcnt': args.gearman_keepcnt, + 'keepidle': 
args.gearman_keepidle, + 'keepintvl': args.gearman_keepintvl}) + worker = JSONGearmanWorker(server_list) worker.set_client_id(hostname) worker.register_task('libra_pool_mgm', handler) diff --git a/libra/mgm/mgm.py b/libra/mgm/mgm.py index 946dc215..2167feee 100644 --- a/libra/mgm/mgm.py +++ b/libra/mgm/mgm.py @@ -118,6 +118,22 @@ def main(): '--gearman', action='append', metavar='HOST:PORT', default=[], help='Gearman job servers' ) + options.parser.add_argument( + '--gearman_keepalive', action="store_true", + help='use KEEPALIVE to Gearman server' + ) + options.parser.add_argument( + '--gearman_keepcnt', type=int, metavar='COUNT', + help='max keepalive probes to send before killing connection' + ) + options.parser.add_argument( + '--gearman_keepidle', type=int, metavar='SECONDS', + help='seconds of idle time before sending keepalive probes' + ) + options.parser.add_argument( + '--gearman_keepintvl', type=int, metavar='SECONDS', + help='seconds between TCP keepalive probes' + ) options.parser.add_argument( '--gearman_ssl_ca', metavar='FILE', help='Gearman SSL certificate authority' @@ -131,8 +147,7 @@ def main(): help='Gearman SSL key' ) options.parser.add_argument( - '--gearman-poll', - dest='gearman_poll', type=int, metavar='TIME', + '--gearman_poll', type=int, metavar='TIME', default=1, help='Gearman worker polling timeout' ) options.parser.add_argument( diff --git a/libra/worker/main.py b/libra/worker/main.py index b9992f36..efff249f 100644 --- a/libra/worker/main.py +++ b/libra/worker/main.py @@ -70,6 +70,22 @@ def main(): choices=known_drivers.keys(), default='haproxy', help='type of device to use' ) + options.parser.add_argument( + '--gearman_keepalive', action="store_true", + help='use KEEPALIVE to Gearman server' + ) + options.parser.add_argument( + '--gearman_keepcnt', type=int, metavar='COUNT', + help='max keepalive probes to send before killing connection' + ) + options.parser.add_argument( + '--gearman_keepidle', type=int, metavar='SECONDS', + 
help='seconds of idle time before sending keepalive probes' + ) + options.parser.add_argument( + '--gearman_keepintvl', type=int, metavar='SECONDS', + help='seconds between TCP keepalive probes' + ) options.parser.add_argument( '--gearman_ssl_ca', dest='gearman_ssl_ca', metavar='FILE', help='Gearman SSL certificate authority' @@ -83,13 +99,12 @@ def main(): help='Gearman SSL key' ) options.parser.add_argument( - '--haproxy-service', dest='haproxy_service', + '--haproxy_service', choices=haproxy_services.keys(), default='ubuntu', help='os services to use with HAProxy driver (when used)' ) options.parser.add_argument( - '-s', '--reconnect_sleep', - dest='reconnect_sleep', type=int, metavar='TIME', + '-s', '--reconnect_sleep', type=int, metavar='TIME', default=60, help='seconds to sleep between job server reconnects' ) options.parser.add_argument( @@ -98,12 +113,11 @@ def main(): help='add a Gearman job server to the connection list' ) options.parser.add_argument( - '--stats-poll', dest='stats_poll', type=int, metavar='TIME', + '--stats_poll', type=int, metavar='TIME', default=300, help='statistics polling interval in seconds' ) options.parser.add_argument( - '--gearman-poll', - dest='gearman_poll', type=int, metavar='TIME', + '--gearman_poll', type=int, metavar='TIME', default=1, help='Gearman worker polling timeout' ) args = options.run() diff --git a/libra/worker/worker.py b/libra/worker/worker.py index c95b1481..97e36488 100644 --- a/libra/worker/worker.py +++ b/libra/worker/worker.py @@ -63,19 +63,20 @@ def config_thread(logger, driver, args): hostname = socket.gethostname() logger.info("[worker] Registering task %s" % hostname) - if all([args.gearman_ssl_key, args.gearman_ssl_cert, args.gearman_ssl_ca]): - ssl_server_list = [] - for host_port in args.server: - host, port = host_port.split(':') - ssl_server_list.append({'host': host, - 'port': int(port), - 'keyfile': args.gearman_ssl_key, - 'certfile': args.gearman_ssl_cert, - 'ca_certs': args.gearman_ssl_ca}) - 
worker = CustomJSONGearmanWorker(ssl_server_list) - else: - worker = CustomJSONGearmanWorker(args.server) + server_list = [] + for host_port in args.server: + host, port = host_port.split(':') + server_list.append({'host': host, + 'port': int(port), + 'keyfile': args.gearman_ssl_key, + 'certfile': args.gearman_ssl_cert, + 'ca_certs': args.gearman_ssl_ca, + 'keepalive': args.gearman_keepalive, + 'keepcnt': args.gearman_keepcnt, + 'keepidle': args.gearman_keepidle, + 'keepintvl': args.gearman_keepintvl}) + worker = CustomJSONGearmanWorker(server_list) worker.set_client_id(hostname) worker.register_task(hostname, handler) worker.logger = logger