backporting...

Change-Id: I894cfb152297dd175105d8f35023a090bc5d8bb5
This commit is contained in:
German Eichberger
2014-06-23 11:26:01 -07:00
committed by Jamie McCarthy
parent 43e8447bdd
commit 80e7bae30c
9 changed files with 387 additions and 384 deletions

View File

@@ -16,14 +16,14 @@ import ipaddress
import threading import threading
from datetime import datetime from datetime import datetime
from gearman.constants import JOB_UNKNOWN
from oslo.config import cfg from oslo.config import cfg
from sqlalchemy import func from sqlalchemy import func
from libra.common.api.lbaas import Device, PoolBuilding, Vip, db_session from libra.common.api.lbaas import Device, PoolBuilding, Vip, db_session
from libra.common.api.lbaas import Counters from libra.common.api.lbaas import Counters
from libra.common.json_gearman import JsonJob from libra.common.json_gearman import JSONGearmanClient
from libra.openstack.common import log from libra.openstack.common import log
import gear
# TODO: Lots of duplication of code here, need to cleanup # TODO: Lots of duplication of code here, need to cleanup
@@ -31,6 +31,7 @@ LOG = log.getLogger(__name__)
class Pool(object): class Pool(object):
DELETE_SECONDS = cfg.CONF['admin_api'].delete_timer_seconds DELETE_SECONDS = cfg.CONF['admin_api'].delete_timer_seconds
PROBE_SECONDS = cfg.CONF['admin_api'].probe_timer_seconds PROBE_SECONDS = cfg.CONF['admin_api'].probe_timer_seconds
VIPS_SECONDS = cfg.CONF['admin_api'].vips_timer_seconds VIPS_SECONDS = cfg.CONF['admin_api'].vips_timer_seconds
@@ -48,8 +49,6 @@ class Pool(object):
self.start_probe_sched() self.start_probe_sched()
self.start_vips_sched() self.start_vips_sched()
self.gear = GearmanWork() # set up the async gearman
def shutdown(self): def shutdown(self):
if self.probe_timer: if self.probe_timer:
self.probe_timer.cancel() self.probe_timer.cancel()
@@ -67,6 +66,7 @@ class Pool(object):
return return
LOG.info('Running device delete check') LOG.info('Running device delete check')
try: try:
message = []
with db_session() as session: with db_session() as session:
devices = session.query(Device).\ devices = session.query(Device).\
filter(Device.status == 'DELETED').all() filter(Device.status == 'DELETED').all()
@@ -76,12 +76,17 @@ class Pool(object):
'action': 'DELETE_DEVICE', 'action': 'DELETE_DEVICE',
'name': device.name 'name': device.name
} }
self.gear.send_delete_message(job_data) message.append(dict(task='libra_pool_mgm', data=job_data))
counter = session.query(Counters).\ counter = session.query(Counters).\
filter(Counters.name == 'devices_deleted').first() filter(Counters.name == 'devices_deleted').first()
counter.value += len(devices) counter.value += len(devices)
session.commit() session.commit()
if not message:
LOG.info("No devices to delete")
else:
gear = GearmanWork()
gear.send_delete_message(message)
except: except:
LOG.exception("Exception when deleting devices") LOG.exception("Exception when deleting devices")
@@ -168,14 +173,24 @@ class Pool(object):
self.start_probe_sched() self.start_probe_sched()
def _build_nodes(self, count): def _build_nodes(self, count):
message = []
it = 0
job_data = {'action': 'BUILD_DEVICE'} job_data = {'action': 'BUILD_DEVICE'}
for it in range(0, count): while it < count:
self.gear.send_create_message(job_data) message.append(dict(task='libra_pool_mgm', data=job_data))
it += 1
gear = GearmanWork()
gear.send_create_message(message)
def _build_vips(self, count): def _build_vips(self, count):
message = []
it = 0
job_data = {'action': 'BUILD_IP'} job_data = {'action': 'BUILD_IP'}
for it in range(0, count): while it < count:
self.gear.send_vips_message(job_data) message.append(dict(task='libra_pool_mgm', data=job_data))
it += 1
gear = GearmanWork()
gear.send_vips_message(message)
def start_probe_sched(self): def start_probe_sched(self):
seconds = datetime.now().second seconds = datetime.now().second
@@ -213,18 +228,111 @@ class Pool(object):
class GearmanWork(object): class GearmanWork(object):
class VIPClient(gear.Client): def __init__(self):
def handleWorkComplete(self, packet): server_list = []
job = super(GearmanWork.VIPClient, self).handleWorkComplete(packet) for server in cfg.CONF['gearman']['servers']:
try: host, port = server.split(':')
if job.msg['response'] == 'FAIL': server_list.append({'host': host,
'port': int(port),
'keyfile': cfg.CONF['gearman']['ssl_key'],
'certfile': cfg.CONF['gearman']['ssl_cert'],
'ca_certs': cfg.CONF['gearman']['ssl_ca'],
'keepalive': cfg.CONF['gearman']['keepalive'],
'keepcnt': cfg.CONF['gearman']['keepcnt'],
'keepidle': cfg.CONF['gearman']['keepidle'],
'keepintvl': cfg.CONF['gearman']['keepintvl']
})
self.gearman_client = JSONGearmanClient(server_list)
def send_delete_message(self, message):
LOG.info("Sending %d gearman messages", len(message))
job_status = self.gearman_client.submit_multiple_jobs(
message, background=False, wait_until_complete=True,
max_retries=10, poll_timeout=30.0
)
delete_count = 0
for status in job_status:
if status.state == JOB_UNKNOWN:
LOG.error('Gearman Job server fail')
continue
if status.timed_out:
LOG.error('Gearman timeout whilst deleting device')
continue
if status.result['response'] == 'FAIL':
LOG.error(
'Pool manager failed to delete a device, removing from DB'
)
delete_count += 1
with db_session() as session:
session.query(Device).\
filter(Device.name == status.result['name']).delete()
session.commit()
LOG.info('%d freed devices delete from pool', delete_count)
def send_vips_message(self, message):
# TODO: make this gearman part more async, not wait for all builds
LOG.info("Sending %d gearman messages", len(message))
job_status = self.gearman_client.submit_multiple_jobs(
message, background=False, wait_until_complete=True,
max_retries=10, poll_timeout=3600.0
)
built_count = 0
for status in job_status:
if status.state == JOB_UNKNOWN:
LOG.error('Gearman Job server fail')
continue
if status.timed_out:
LOG.error('Gearman timeout whilst building vip')
continue
if status.result['response'] == 'FAIL':
LOG.error('Pool manager failed to build a vip') LOG.error('Pool manager failed to build a vip')
else: continue
self._add_vip(job.msg)
built_count += 1
try:
self._add_vip(status.result)
except: except:
LOG.exception( LOG.exception(
'Could not add vip to DB, node data: {0}' 'Could not add vip to DB, node data: {0}'
.format(job.msg) .format(status.result)
)
LOG.info(
'{vips} vips built and added to pool'.format(vips=built_count)
)
def send_create_message(self, message):
# TODO: make this gearman part more async, not wait for all builds
LOG.info("Sending {0} gearman messages".format(len(message)))
job_status = self.gearman_client.submit_multiple_jobs(
message, background=False, wait_until_complete=True,
max_retries=10, poll_timeout=3600.0
)
built_count = 0
for status in job_status:
if status.state == JOB_UNKNOWN:
LOG.error('Gearman Job server fail')
continue
if status.timed_out:
LOG.error('Gearman timeout whilst building device')
continue
if status.result['response'] == 'FAIL':
LOG.error('Pool manager failed to build a device')
if 'name' in status.result:
self._add_bad_node(status.result)
continue
built_count += 1
try:
self._add_node(status.result)
except:
LOG.exception(
'Could not add node to DB, node data: {0}'
.format(status.result)
)
LOG.info(
'{nodes} devices built and added to pool'.format(nodes=built_count)
) )
def _add_vip(self, data): def _add_vip(self, data):
@@ -238,41 +346,6 @@ class GearmanWork(object):
counter.value += 1 counter.value += 1
session.commit() session.commit()
class DeleteClient(gear.Client):
def handleWorkComplete(self, packet):
job = super(GearmanWork.DeleteClient,
self).handleWorkComplete(packet)
if job.msg['response'] == 'FAIL':
LOG.error(
'Pool manager failed to delete a device, removing from DB')
self._delete_from_db(job.msg)
def _delete_from_db(self, msg):
with db_session() as session:
session.query(Device). \
filter(Device.name == msg['name']).delete()
session.commit()
LOG.info("Delete device %s" % msg['name'])
class CreateClient(gear.Client):
def handleWorkComplete(self, packet):
job = super(GearmanWork.CreateClient,
self).handleWorkComplete(packet)
try:
if job.msg['response'] == 'FAIL':
LOG.error('Pool manager failed to build a device')
if 'name' in job.msg:
self._add_bad_node(job.msg)
else:
self._add_node(job.msg)
except:
LOG.exception(
'Could not add node to DB, node data: {0}'
.format(job.msg)
)
def _add_node(self, data): def _add_node(self, data):
LOG.info('Adding device {0} to DB'.format(data['name'])) LOG.info('Adding device {0} to DB'.format(data['name']))
device = Device() device = Device()
@@ -294,7 +367,8 @@ class GearmanWork(object):
def _add_bad_node(self, data): def _add_bad_node(self, data):
LOG.info( LOG.info(
"Adding bad device {0} to DB to be deleted" % (data['name'])) 'Adding bad device {0} to DB to be deleted'.format(data['name'])
)
device = Device() device = Device()
device.name = data['name'] device.name = data['name']
device.publicIpAddr = data['addr'] device.publicIpAddr = data['addr']
@@ -311,28 +385,3 @@ class GearmanWork(object):
filter(Counters.name == 'devices_bad_built').first() filter(Counters.name == 'devices_bad_built').first()
counter.value += 1 counter.value += 1
session.commit() session.commit()
def __init__(self):
self.vip_client = GearmanWork.VIPClient("Vip Client")
self.delete_client = GearmanWork.DeleteClient("Delete Client")
self.create_client = GearmanWork.CreateClient("Create Client")
for x in [self.vip_client, self.create_client, self.delete_client]:
self._init_client(x)
def _init_client(self, client):
client.log = LOG
for server in cfg.CONF['gearman']['servers']:
host, port = server.split(':')
client.addServer(host, port, cfg.CONF['gearman']['ssl_key'],
cfg.CONF['gearman']['ssl_cert'],
cfg.CONF['gearman']['ssl_ca'])
def send_delete_message(self, message, name='libra_pool_mgm'):
self.delete_client.submitJob(JsonJob(name, message))
def send_vips_message(self, message, name='libra_pool_mgm'):
self.vip_client.submitJob(JsonJob(name, message))
def send_create_message(self, message, name='libra_pool_mgm'):
self.create_client.submitJob(JsonJob(name, message))

View File

@@ -37,7 +37,6 @@ class OfflineStats(object):
self.server_id = cfg.CONF['admin_api']['server_id'] self.server_id = cfg.CONF['admin_api']['server_id']
self.number_of_servers = cfg.CONF['admin_api']['number_of_servers'] self.number_of_servers = cfg.CONF['admin_api']['number_of_servers']
self.gearman = GearJobs()
self.start_offline_sched() self.start_offline_sched()
def shutdown(self): def shutdown(self):
@@ -81,7 +80,8 @@ class OfflineStats(object):
return (0, 0) return (0, 0)
for lb in devices: for lb in devices:
node_list.append(lb.name) node_list.append(lb.name)
failed_lbs = self.gearman.offline_check(node_list) gearman = GearJobs()
failed_lbs = gearman.offline_check(node_list)
failed = len(failed_lbs) failed = len(failed_lbs)
if failed > self.error_limit: if failed > self.error_limit:
LOG.error( LOG.error(

View File

@@ -36,7 +36,6 @@ class PingStats(object):
self.stats_driver = cfg.CONF['admin_api']['stats_driver'] self.stats_driver = cfg.CONF['admin_api']['stats_driver']
LOG.info("Selected stats drivers: %s", self.stats_driver) LOG.info("Selected stats drivers: %s", self.stats_driver)
self.gearman = GearJobs()
self.start_ping_sched() self.start_ping_sched()
def shutdown(self): def shutdown(self):
@@ -76,7 +75,8 @@ class PingStats(object):
return (0, 0) return (0, 0)
for lb in devices: for lb in devices:
node_list.append(lb.name) node_list.append(lb.name)
failed_lbs, node_status = self.gearman.send_pings(node_list) gearman = GearJobs()
failed_lbs, node_status = gearman.send_pings(node_list)
failed = len(failed_lbs) failed = len(failed_lbs)
if failed > self.error_limit: if failed > self.error_limit:
LOG.error( LOG.error(

View File

@@ -12,238 +12,213 @@
# License for the specific language governing permissions and limitations # License for the specific language governing permissions and limitations
# under the License. # under the License.
from gearman.constants import JOB_UNKNOWN
from oslo.config import cfg from oslo.config import cfg
from libra.common.json_gearman import JSONGearmanClient
from libra.openstack.common import log from libra.openstack.common import log
import gear
from libra.common.json_gearman import JsonJob
import time
LOG = log.getLogger(__name__) LOG = log.getLogger(__name__)
class GearJobs(object): class GearJobs(object):
class DisconnectClient(gear.Client):
def handleDisconnect(self, job):
job.disconnect = True
class DisconnectJob(JsonJob):
def __init__(self, name, msg, unique=None):
super(GearJobs.DisconnectJob, self).__init__(name, msg, unique)
self.disconnect = False
def __init__(self): def __init__(self):
self.poll_timeout = cfg.CONF['admin_api']['stats_poll_timeout'] self.poll_timeout = cfg.CONF['admin_api']['stats_poll_timeout']
self.poll_retry = cfg.CONF['admin_api']['stats_poll_timeout_retry'] self.poll_retry = cfg.CONF['admin_api']['stats_poll_timeout_retry']
self.gm_client = gear.Client("stats") server_list = []
self.gm_client.log = LOG
for server in cfg.CONF['gearman']['servers']: for server in cfg.CONF['gearman']['servers']:
host, port = server.split(':') host, port = server.split(':')
self.gm_client.addServer(host, port, server_list.append({'host': host,
cfg.CONF['gearman']['ssl_key'], 'port': int(port),
cfg.CONF['gearman']['ssl_cert'], 'keyfile': cfg.CONF['gearman']['ssl_key'],
cfg.CONF['gearman']['ssl_ca']) 'certfile': cfg.CONF['gearman']['ssl_cert'],
'ca_certs': cfg.CONF['gearman']['ssl_ca'],
def _all_complete(self, jobs): 'keepalive': cfg.CONF['gearman']['keepalive'],
for job in jobs: 'keepcnt': cfg.CONF['gearman']['keepcnt'],
if not (job.complete or job.disconnect): 'keepidle': cfg.CONF['gearman']['keepidle'],
return False 'keepintvl': cfg.CONF['gearman']['keepintvl']
return True })
self.gm_client = JSONGearmanClient(server_list)
def _wait(self, pings):
poll_count = 0
while not self._all_complete(pings) and poll_count < self.poll_retry:
# wait for jobs
time.sleep(self.poll_timeout)
poll_count += 1
def send_pings(self, node_list): def send_pings(self, node_list):
# TODO: lots of duplicated code that needs cleanup
list_of_jobs = []
failed_list = [] failed_list = []
node_status = dict() node_status = dict()
retry_list = [] retry_list = []
submitted_pings = []
# The message name is STATS for historical reasons. Real # The message name is STATS for historical reasons. Real
# data statistics are gathered with METRICS messages. # data statistics are gathered with METRICS messages.
job_data = {"hpcs_action": "STATS"} job_data = {"hpcs_action": "STATS"}
for node in node_list: for node in node_list:
job = GearJobs.DisconnectJob(str(node), job_data) list_of_jobs.append(dict(task=str(node), data=job_data))
self.gm_client.submitJob(job) submitted_pings = self.gm_client.submit_multiple_jobs(
submitted_pings.append(job) list_of_jobs, background=False, wait_until_complete=True,
poll_timeout=self.poll_timeout
self._wait(submitted_pings) )
for ping in submitted_pings: for ping in submitted_pings:
if ping.disconnect: if ping.state == JOB_UNKNOWN:
# TODO: Gearman server failed, ignoring for now # TODO: Gearman server failed, ignoring for now
LOG.error('Gearman Job server fail') LOG.error('Gearman Job server fail')
continue continue
if not ping.complete: if ping.timed_out:
# Ping timeout # Ping timeout
retry_list.append(ping) retry_list.append(ping.job.task)
continue continue
if ping.msg['hpcs_response'] == 'FAIL': if ping.result['hpcs_response'] == 'FAIL':
if ( if (
'status' in ping.result and 'status' in ping.result and
ping.msg['status'] == 'DELETED' ping.result['status'] == 'DELETED'
): ):
continue continue
# Error returned by Gearman # Error returned by Gearman
failed_list.append(ping) failed_list.append(ping.job.task)
continue continue
else: else:
if 'nodes' in ping.msg: if 'nodes' in ping.result:
node_status[ping.name] = ping.msg['nodes'] node_status[ping.job.task] = ping.result['nodes']
submitted_pings = [] list_of_jobs = []
if len(retry_list) > 0: if len(retry_list) > 0:
LOG.info( LOG.info(
"{0} pings timed out, retrying".format(len(retry_list)) "{0} pings timed out, retrying".format(len(retry_list))
) )
for node in retry_list: for node in retry_list:
job = GearJobs.DisconnectJob(node.name, node.msg) list_of_jobs.append(dict(task=str(node), data=job_data))
self.gm_client.submitJob(job) submitted_pings = self.gm_client.submit_multiple_jobs(
submitted_pings.append(job) list_of_jobs, background=False, wait_until_complete=True,
poll_timeout=self.poll_retry
self._wait(submitted_pings) )
for ping in submitted_pings: for ping in submitted_pings:
if ping.disconnect: if ping.state == JOB_UNKNOWN:
# TODO: Gearman server failed, ignoring for now # TODO: Gearman server failed, ignoring for now
LOG.error('Gearman Job server fail') LOG.error('Gearman Job server fail')
continue continue
if not ping.complete: if ping.timed_out:
# Ping timeout # Ping timeout
failed_list.append(ping.name) failed_list.append(ping.job.task)
continue continue
if ping.msg['hpcs_response'] == 'FAIL': if ping.result['hpcs_response'] == 'FAIL':
if ( if (
'status' in ping.msg and 'status' in ping.result and
ping.msg['status'] == 'DELETED' ping.result['status'] == 'DELETED'
): ):
continue continue
# Error returned by Gearman # Error returned by Gearman
failed_list.append(ping.name) failed_list.append(ping.job.task)
continue continue
else: else:
if 'nodes' in ping.result: if 'nodes' in ping.result:
node_status[ping.name] = ping.msg['nodes'] node_status[ping.job.task] = ping.result['nodes']
return failed_list, node_status return failed_list, node_status
def offline_check(self, node_list): def offline_check(self, node_list):
list_of_jobs = []
failed_list = [] failed_list = []
submitted_pings = []
job_data = {"hpcs_action": "DIAGNOSTICS"} job_data = {"hpcs_action": "DIAGNOSTICS"}
for node in node_list: for node in node_list:
job = GearJobs.DisconnectJob(str(node), job_data) list_of_jobs.append(dict(task=str(node), data=job_data))
self.gm_client.submitJob(job) submitted_pings = self.gm_client.submit_multiple_jobs(
submitted_pings.append(job) list_of_jobs, background=False, wait_until_complete=True,
poll_timeout=self.poll_timeout
self._wait(submitted_pings) )
for ping in submitted_pings: for ping in submitted_pings:
if ping.disconnect: if ping.state == JOB_UNKNOWN:
LOG.error( LOG.error(
"Gearman Job server failed during OFFLINE check of {0}". "Gearman Job server failed during OFFLINE check of {0}".
format(ping.job.task) format(ping.job.task)
) )
elif not ping.complete: elif ping.timed_out:
failed_list.append(ping.name) failed_list.append(ping.job.task)
elif ping.msg['network'] == 'FAIL': elif ping.result['network'] == 'FAIL':
failed_list.append(ping.name) failed_list.append(ping.job.task)
else: else:
gearman_count = 0 gearman_count = 0
gearman_fail = 0 gearman_fail = 0
for gearman_test in ping.msg['gearman']: for gearman_test in ping.result['gearman']:
gearman_count += 1 gearman_count += 1
if gearman_test['status'] == 'FAIL': if gearman_test['status'] == 'FAIL':
gearman_fail += 1 gearman_fail += 1
# Need 2/3rds gearman up # Need 2/3rds gearman up
max_fail_count = gearman_count / 3 max_fail_count = gearman_count / 3
if gearman_fail > max_fail_count: if gearman_fail > max_fail_count:
failed_list.append(ping.name) failed_list.append(ping.job.task)
return failed_list return failed_list
def get_discover(self, name): def get_discover(self, name):
# Used in the v2 devices controller # Used in the v2 devices controller
job_data = {"hpcs_action": "DISCOVER"} job_data = {"hpcs_action": "DISCOVER"}
job = GearJobs.DisconnectJob(str(name), job_data) job = self.gm_client.submit_job(
self.gm_client.submitJob(job, gear.PRECEDENCE_HIGH) str(name), job_data, background=False, wait_until_complete=True,
poll_timeout=10
poll_count = 0 )
while not job.complete and not job.disconnect \ if job.state == JOB_UNKNOWN:
and poll_count < self.poll_retry: # Gearman server failed
# wait for jobs TODO make sure right unit/value
time.sleep(self.poll_timeout)
poll_count += 1
if not job.complete:
return None return None
elif job.timed_out:
if job.result['hpcs_response'] == 'FAIL': # Time out is a fail
return None
elif job.result['hpcs_response'] == 'FAIL':
# Fail response is a fail # Fail response is a fail
return None return None
return job.result return job.result
def get_stats(self, node_list): def get_stats(self, node_list):
# TODO: lots of duplicated code that needs cleanup # TODO: lots of duplicated code that needs cleanup
list_of_jobs = []
failed_list = [] failed_list = []
retry_list = [] retry_list = []
submitted_stats = []
results = {} results = {}
job_data = {"hpcs_action": "METRICS"} job_data = {"hpcs_action": "METRICS"}
for node in node_list: for node in node_list:
job = GearJobs.DisconnectJob(str(node), job_data) list_of_jobs.append(dict(task=str(node), data=job_data))
self.gm_client.submitJob(job) submitted_stats = self.gm_client.submit_multiple_jobs(
submitted_stats.append(job) list_of_jobs, background=False, wait_until_complete=True,
poll_timeout=self.poll_timeout
self._wait(submitted_stats) )
for stats in submitted_stats: for stats in submitted_stats:
if stats.disconnect: if stats.state == JOB_UNKNOWN:
# TODO: Gearman server failed, ignoring for now # TODO: Gearman server failed, ignoring for now
retry_list.append(stats) retry_list.append(stats.job.task)
elif not stats.complete: elif stats.timed_out:
# Timeout # Timeout
retry_list.append(stats) retry_list.append(stats.job.task)
elif stats.msg['hpcs_response'] == 'FAIL': elif stats.result['hpcs_response'] == 'FAIL':
# Error returned by Gearman # Error returned by Gearman
failed_list.append(stats.name) failed_list.append(stats.job.task)
else: else:
# Success # Success
results[stats.name] = stats.msg results[stats.job.task] = stats.result
submitted_stats = [] list_of_jobs = []
if len(retry_list) > 0: if len(retry_list) > 0:
LOG.info( LOG.info(
"{0} Statistics gathering timed out, retrying". "{0} Statistics gathering timed out, retrying".
format(len(retry_list)) format(len(retry_list))
) )
for node in retry_list: for node in retry_list:
job = GearJobs.DisconnectJob(node.name, node.msg) list_of_jobs.append(dict(task=str(node), data=job_data))
self.gm_client.submitJob(job) submitted_stats = self.gm_client.submit_multiple_jobs(
submitted_stats.append(job) list_of_jobs, background=False, wait_until_complete=True,
poll_timeout=self.poll_retry
self._wait(submitted_stats) )
for stats in submitted_stats: for stats in submitted_stats:
if stats.disconnect: if stats.state == JOB_UNKNOWN:
# TODO: Gearman server failed, ignoring for now # TODO: Gearman server failed, ignoring for now
LOG.error( LOG.error(
"Gearman Job server failed gathering statistics " "Gearman Job server failed gathering statistics "
"on {0}".format(stats.job.task) "on {0}".format(stats.job.task)
) )
failed_list.append(stats.name) failed_list.append(stats.job.task)
elif not stats.complete: elif stats.timed_out:
# Timeout # Timeout
failed_list.append(stats.name) failed_list.append(stats.job.task)
elif stats.msg['hpcs_response'] == 'FAIL': elif stats.result['hpcs_response'] == 'FAIL':
# Error returned by Gearman # Error returned by Gearman
failed_list.append(stats.name) failed_list.append(stats.job.task)
else: else:
# Success # Success
results[stats.name] = stats.msg results[stats.job.task] = stats.result
return failed_list, results return failed_list, results

View File

@@ -37,7 +37,6 @@ class UsageStats(object):
self.server_id = cfg.CONF['admin_api']['server_id'] self.server_id = cfg.CONF['admin_api']['server_id']
self.number_of_servers = cfg.CONF['admin_api']['number_of_servers'] self.number_of_servers = cfg.CONF['admin_api']['number_of_servers']
self.stats_freq = cfg.CONF['admin_api'].stats_freq self.stats_freq = cfg.CONF['admin_api'].stats_freq
self.gearman = GearJobs()
self.start_stats_sched() self.start_stats_sched()
@@ -102,7 +101,8 @@ class UsageStats(object):
for device in devices: for device in devices:
node_list.append(device.name) node_list.append(device.name)
failed_list, results = self.gearman.get_stats(node_list) gearman = GearJobs()
failed_list, results = gearman.get_stats(node_list)
failed = len(failed_list) failed = len(failed_list)
if failed > 0: if failed > 0:

View File

@@ -13,22 +13,19 @@
# under the License. # under the License.
import eventlet import eventlet
import gear
import json
eventlet.monkey_patch() eventlet.monkey_patch()
import ipaddress import ipaddress
from libra.common.json_gearman import JSONGearmanClient
from libra.common.api.lbaas import LoadBalancer, db_session, Device, Node, Vip from libra.common.api.lbaas import LoadBalancer, db_session, Device, Node, Vip
from libra.common.api.lbaas import HealthMonitor, Counters from libra.common.api.lbaas import HealthMonitor, Counters
from libra.common.api.lbaas import loadbalancers_devices from libra.common.api.lbaas import loadbalancers_devices
from libra.common.api.mnb import update_mnb from libra.common.api.mnb import update_mnb
from libra.openstack.common import log from libra.openstack.common import log
from pecan import conf from pecan import conf
from time import sleep
LOG = log.getLogger(__name__) LOG = log.getLogger(__name__)
POLL_COUNT = 10
POLL_SLEEP = 10
gearman_workers = [ gearman_workers = [
'UPDATE', # Create/Update a Load Balancer. 'UPDATE', # Create/Update a Load Balancer.
@@ -42,17 +39,6 @@ gearman_workers = [
] ]
class DisconnectClient(gear.Client):
def handleDisconnect(self, job):
job.disconnect = True
class DisconnectJob(gear.Job):
def __init__(self, name, arguments):
super(DisconnectJob, self).__init__(name, arguments)
self.disconnect = False
def submit_job(job_type, host, data, lbid): def submit_job(job_type, host, data, lbid):
eventlet.spawn_n(client_job, job_type, str(host), data, lbid) eventlet.spawn_n(client_job, job_type, str(host), data, lbid)
@@ -136,15 +122,19 @@ class GearmanClientThread(object):
self.host = host self.host = host
self.lbid = lbid self.lbid = lbid
self.gear_client = DisconnectClient() server_list = []
for server in conf.gearman.server: for server in conf.gearman.server:
ghost, gport = server.split(':') ghost, gport = server.split(':')
self.gear_client.addServer(ghost, server_list.append({'host': ghost,
int(gport), 'port': int(gport),
conf.gearman.ssl_key, 'keyfile': conf.gearman.ssl_key,
conf.gearman.ssl_cert, 'certfile': conf.gearman.ssl_cert,
conf.gearman.ssl_ca) 'ca_certs': conf.gearman.ssl_ca,
'keepalive': conf.gearman.keepalive,
'keepcnt': conf.gearman.keepcnt,
'keepidle': conf.gearman.keepidle,
'keepintvl': conf.gearman.keepintvl})
self.gearman_client = JSONGearmanClient(server_list)
def send_assign(self, data): def send_assign(self, data):
NULL = None # For pep8 NULL = None # For pep8
@@ -532,40 +522,28 @@ class GearmanClientThread(object):
mnb_data["tenantid"]) mnb_data["tenantid"])
def _send_message(self, message, response_name): def _send_message(self, message, response_name):
job_status = self.gearman_client.submit_job(
self.gear_client.waitForServer() self.host, message, background=False, wait_until_complete=True,
max_retries=10, poll_timeout=120.0
job = DisconnectJob(self.host, json.dumps(message)) )
if job_status.state == 'UNKNOWN':
self.gear_client.submitJob(job) # Gearman server connection failed
LOG.error('Could not talk to gearman server')
pollcount = 0 return False, "System error communicating with load balancer"
# Would like to make these config file settings if job_status.timed_out:
while not job.complete and pollcount < POLL_COUNT: # Job timed out
sleep(POLL_SLEEP) LOG.warning(
pollcount += 1 'Gearman timeout talking to {0}'.format(self.host)
)
if job.disconnect:
LOG.error('Gearman Job server fail - disconnect')
return False, "Gearman Job server fail - "\
"disconnect communicating with load balancer"
# We timed out waiting for the job to finish
if not job.complete:
LOG.warning('Gearman timeout talking to {0}'.format(self.host))
return False, "Timeout error communicating with load balancer" return False, "Timeout error communicating with load balancer"
LOG.debug(job_status.result)
result = json.loads(job.data[0]) if 'badRequest' in job_status.result:
error = job_status.result['badRequest']['validationErrors']
LOG.debug(result)
if 'badRequest' in result:
error = result['badRequest']['validationErrors']
return False, error['message'] return False, error['message']
if result[response_name] == 'FAIL': if job_status.result[response_name] == 'FAIL':
# Worker says 'no' # Worker says 'no'
if 'hpcs_error' in result: if 'hpcs_error' in job_status.result:
error = result['hpcs_error'] error = job_status.result['hpcs_error']
else: else:
error = 'Load Balancer error' error = 'Load Balancer error'
LOG.error( LOG.error(
@@ -573,4 +551,4 @@ class GearmanClientThread(object):
) )
return False, error return False, error
LOG.info('Gearman success from {0}'.format(self.host)) LOG.info('Gearman success from {0}'.format(self.host))
return True, result return True, job_status.result

View File

@@ -13,14 +13,28 @@
# under the License. # under the License.
import json import json
import gear from gearman import GearmanClient, GearmanWorker, DataEncoder
# Here is the good stuff
class JsonJob(gear.Job):
def __init__(self, name, msg, unique=None):
super(JsonJob, self).__init__(name, json.dumps(msg), unique)
@property class JSONDataEncoder(DataEncoder):
def msg(self): """ Class to transform data that the worker either receives or sends. """
if self.data:
return json.loads(self.data[0]) @classmethod
def encode(cls, encodable_object):
""" Encode JSON object as string """
return json.dumps(encodable_object)
@classmethod
def decode(cls, decodable_string):
""" Decode string to JSON object """
return json.loads(decodable_string)
class JSONGearmanWorker(GearmanWorker):
""" Overload the Gearman worker class so we can set the data encoder. """
data_encoder = JSONDataEncoder
class JSONGearmanClient(GearmanClient):
""" Overload the Gearman client class so we can set the data encoder. """
data_encoder = JSONDataEncoder

View File

@@ -12,32 +12,20 @@
# License for the specific language governing permissions and limitations # License for the specific language governing permissions and limitations
# under the License. # under the License.
import gear
import json
from time import sleep from time import sleep
from novaclient import exceptions from novaclient import exceptions
from oslo.config import cfg from oslo.config import cfg
from gearman.constants import JOB_UNKNOWN
from libra.openstack.common import log from libra.openstack.common import log
from libra.common.json_gearman import JSONGearmanClient
from libra.mgm.nova import Node, BuildError, NotFound from libra.mgm.nova import Node, BuildError, NotFound
POLL_COUNT = 10
LOG = log.getLogger(__name__) LOG = log.getLogger(__name__)
class DisconnectClient(gear.Client):
def handleDisconnect(self, job):
job.disconnect = True
class DisconnectJob(gear.Job):
def __init__(self, name, arguments):
super(DisconnectJob, self).__init__(name, arguments)
self.disconnect = False
class BuildController(object): class BuildController(object):
RESPONSE_FIELD = 'response' RESPONSE_FIELD = 'response'
RESPONSE_SUCCESS = 'PASS' RESPONSE_SUCCESS = 'PASS'
RESPONSE_FAILURE = 'FAIL' RESPONSE_FAILURE = 'FAIL'
@@ -141,56 +129,44 @@ class BuildController(object):
def _test_node(self, name): def _test_node(self, name):
""" Run diags on node, blow it away if bad """ """ Run diags on node, blow it away if bad """
server_list = []
client = DisconnectClient()
for server in cfg.CONF['gearman']['servers']: for server in cfg.CONF['gearman']['servers']:
host, port = server.split(':') host, port = server.split(':')
client.addServer(host, server_list.append({'host': host,
int(port), 'port': int(port),
cfg.CONF['gearman']['ssl_key'], 'keyfile': cfg.CONF['gearman']['ssl_key'],
cfg.CONF['gearman']['ssl_cert'], 'certfile': cfg.CONF['gearman']['ssl_cert'],
cfg.CONF['gearman']['ssl_ca']) 'ca_certs': cfg.CONF['gearman']['ssl_ca'],
'keepalive': cfg.CONF['gearman']['keepalive'],
client.waitForServer() 'keepcnt': cfg.CONF['gearman']['keepcnt'],
'keepidle': cfg.CONF['gearman']['keepidle'],
'keepintvl': cfg.CONF['gearman']['keepintvl']})
gm_client = JSONGearmanClient(server_list)
job_data = {'hpcs_action': 'DIAGNOSTICS'} job_data = {'hpcs_action': 'DIAGNOSTICS'}
job_status = gm_client.submit_job(
job = DisconnectJob(str(name), json.dumps(job_data)) str(name), job_data, background=False, wait_until_complete=True,
max_retries=10, poll_timeout=10
client.submitJob(job) )
if job_status.state == JOB_UNKNOWN:
pollcount = 0 # Gearman server connect fail, count as bad node because we can't
pollsleepinterval = cfg.CONF['mgm']['build_diag_timeout'] / POLL_COUNT # tell if it really is working
while not job.complete\ LOG.error('Could not talk to gearman server')
and pollcount < POLL_COUNT\
and not job.disconnect:
sleep(pollsleepinterval)
pollcount += 1
if job.disconnect:
LOG.error('Gearman Job server fail - disconnect')
return False return False
if job_status.timed_out:
# We timed out waiting for the job to finish
if not job.complete:
LOG.warning('Timeout getting diags from {0}'.format(name)) LOG.warning('Timeout getting diags from {0}'.format(name))
return False return False
LOG.debug(job_status.result)
result = json.loads(job.data[0])
LOG.debug(result)
# Would only happen if DIAGNOSTICS call not supported # Would only happen if DIAGNOSTICS call not supported
if result['hpcs_response'] == 'FAIL': if job_status.result['hpcs_response'] == 'FAIL':
return True return True
if result['network'] == 'FAIL': if job_status.result['network'] == 'FAIL':
return False return False
gearman_count = 0 gearman_count = 0
gearman_fail = 0 gearman_fail = 0
for gearman_test in result['gearman']: for gearman_test in job_status.result['gearman']:
gearman_count += 1 gearman_count += 1
if gearman_test['status'] == 'FAIL': if gearman_test['status'] == 'FAIL':
LOG.info( LOG.info(

View File

@@ -12,11 +12,14 @@
# License for the specific language governing permissions and limitations # License for the specific language governing permissions and limitations
# under the License. # under the License.
import gear import gearman.errors
import json import json
import socket import socket
import time
from oslo.config import cfg from oslo.config import cfg
from libra.common.json_gearman import JSONGearmanWorker
from libra.mgm.controllers.root import PoolMgmController from libra.mgm.controllers.root import PoolMgmController
from libra.openstack.common import log from libra.openstack.common import log
@@ -24,41 +27,49 @@ from libra.openstack.common import log
LOG = log.getLogger(__name__) LOG = log.getLogger(__name__)
def handler(job): def handler(worker, job):
LOG.debug("Received JSON message: {0}".format(json.dumps(job.arguments))) LOG.debug("Received JSON message: {0}".format(json.dumps(job.data)))
controller = PoolMgmController(json.loads(job.arguments)) controller = PoolMgmController(job.data)
response = controller.run() response = controller.run()
LOG.debug("Return JSON message: {0}".format(json.dumps(response))) LOG.debug("Return JSON message: {0}".format(json.dumps(response)))
job.sendWorkComplete(json.dumps(response)) return response
def worker_thread(): def worker_thread():
LOG.info("Registering task libra_pool_mgm") LOG.info("Registering task libra_pool_mgm")
hostname = socket.gethostname() hostname = socket.gethostname()
worker = gear.Worker(hostname) server_list = []
for host_port in cfg.CONF['gearman']['servers']: for host_port in cfg.CONF['gearman']['servers']:
host, port = host_port.split(':') host, port = host_port.split(':')
worker.addServer(host, server_list.append({'host': host,
int(port), 'port': int(port),
cfg.CONF['gearman']['ssl_key'], 'keyfile': cfg.CONF['gearman']['ssl_key'],
cfg.CONF['gearman']['ssl_cert'], 'certfile': cfg.CONF['gearman']['ssl_cert'],
cfg.CONF['gearman']['ssl_ca']) 'ca_certs': cfg.CONF['gearman']['ssl_ca'],
worker.registerFunction('libra_pool_mgm') 'keepalive': cfg.CONF['gearman']['keepalive'],
'keepcnt': cfg.CONF['gearman']['keepcnt'],
'keepidle': cfg.CONF['gearman']['keepidle'],
'keepintvl': cfg.CONF['gearman']['keepintvl']})
worker = JSONGearmanWorker(server_list)
worker.set_client_id(hostname)
worker.register_task('libra_pool_mgm', handler)
worker.logger = LOG worker.logger = LOG
retry = True retry = True
while retry: while (retry):
try: try:
job = worker.getJob() worker.work(cfg.CONF['gearman']['poll'])
handler(job)
except KeyboardInterrupt: except KeyboardInterrupt:
retry = False retry = False
except Exception as e: except gearman.errors.ServerUnavailable:
LOG.exception("Exception in pool manager worker: %s, %s" LOG.error("Job server(s) went away. Reconnecting.")
% (e.__class__, e)) time.sleep(cfg.CONF['gearman']['reconnect_sleep'])
retry = True
except Exception:
LOG.exception("Exception in worker")
retry = False retry = False
LOG.debug("Pool manager process terminated.") LOG.debug("Pool manager process terminated.")