Update API stats data collection

Replace the existing implementation of collectApi, which is not ideal
as it relies on parsing log files for GET/POST requests, with the
implementation from the api-stats.py tool. Only the total API, DB and
RabbitMQ connection stats for the services of interest are collected.
Per-PID stats for individual services are not collected since they
take up storage space and add little value.
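
For illustration, each collection interval now posts one entry per
service to the api_requests measurement, e.g. (node name and counts
are hypothetical):

api_requests,'node'='controller-0','service'='sysinv-api' 'api'='3','db'='2','rabbit'='1'
api_requests,'node'='controller-0','service'='neutron-server' 'api'='6','db'='4','rabbit'='2'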

Gunicorn stats currently represent the aggregate of all
gunicorn-based services (e.g. panko-api, aodh-api, keystone-public,
openstack_dashboard). They will be decomposed into per-service stats
in a subsequent commit.

Functional tests completed by Mathieu Godin.

Change-Id: I8a27fe3374b57d66e35da937a3a250caf78245d3
Story: 2002895
Task: 22858
Signed-off-by: Tee Ngo <tee.ngo@windriver.com>
Mathieu Godin 2018-07-03 17:02:24 -04:00 committed by Tee Ngo
parent df32881300
commit 1b1a84057d
2 changed files with 63 additions and 54 deletions

engtools.conf

@@ -47,10 +47,11 @@ netstats=10
postgres=30
rabbitmq=3600
vswitch=120
api_requests=5
[AdditionalOptions]
# Set this option to Y/N to enable/disable Openstack API GET/POST collection
API_REQUESTS=N
API_REQUESTS=Y
# Set this option to Y/N to enable/disable the collection of all services and not just the ones listed below. Note that this hasn't been tested thoroughly
ALL_SERVICES=N
@@ -75,3 +76,23 @@ RABBITMQ_QUEUE_LIST=notifications.info versioned_notifications.info
[CommonServices]
COMMON_SERVICE_LIST=dnsmasq ceilometer-polling haproxy hwmond pmond rmond fsmond sw-patch-agent sysinv-agent syslog-ng hostwd iscsid io-monitor-manager acpid hbsClient logmgmt mtcClient mtcalarmd mtclogd sshd ntpd ptp4l phc2sys smartd sm sm-eru sm-watchdog sm-api ceilometer keyring cinder-rtstool tuned polkitd lldpd IPaddr2 dnsmasq systemd-udevd systemd-journald logrotate collectd
[StaticServices]
STATIC_SERVICE_LIST=occtop memtop schedtop top.sh iostat.sh netstats.sh diskstats.sh memstats.sh filestats.sh ceph.sh postgres.sh rabbitmq.sh vswitch.sh
[OpenStackServices]
OPEN_STACK_SERVICE_LIST=nova cinder aodh ceilometer heat glance ceph horizon keystone puppet sysinv neutron nova_api postgres panko nova_cell0 magnum ironic murano gnocchi
[SkipList]
SKIP_LIST=ps top sh <defunct> curl awk wc sleep lsof cut grep ip tail su
[ExcludeList]
EXCLUDE_LIST=python python2 bash perl sudo init
[ApiStatsConstantPorts]
DB_PORT_NUMBER=5432
RABBIT_PORT_NUMBER=5672
[ApiStatsServices]
API_STATS_STRUCTURE=gunicorn;gunicorn;5000|sysinv-conductor;sysinv-co ;|neutron-server;neutron-s;9696|nova-conductor;nova-cond ;|sysinv-agent;sysinv-ag;|sysinv-api;sysinv-ap;6385|nova-api;nova-api ;18774|cinder-api;cinder-a;8776|glance-api;glance-a;9292|ceilometer;ceilomete;8777|vim;nfv-vim;4545|heat-api;heat-a;8004|heat-engine;heat-e;8004
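
For reference, each '|'-separated entry in API_STATS_STRUCTURE is
"label;process-name-pattern;api-port"; the api-port field may be empty
for services with no REST endpoint (e.g. sysinv-conductor), and the DB
and RabbitMQ ports are shared and come from [ApiStatsConstantPorts].
A minimal parsing sketch, mirroring the live_stream.py change below
(the structure string is truncated here for brevity):

from collections import OrderedDict

DB_PORT = "5432"
RABBIT_PORT = "5672"
structure = "sysinv-api;sysinv-ap;6385|sysinv-conductor;sysinv-co ;"

services = OrderedDict()
for entry in structure.split('|'):
    label, name, api_port = entry.split(';')
    services[label] = {'name': name,
                       'db-port': DB_PORT,
                       'rabbit-port': RABBIT_PORT,
                       # an empty api-port means only DB/RabbitMQ connections are counted
                       'api-port': api_port if api_port else None}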

live_stream.py

@@ -14,6 +14,8 @@ import psutil
import fcntl
import logging
import ConfigParser
import itertools
import six
from multiprocessing import Process, cpu_count
from subprocess import Popen, PIPE
from collections import OrderedDict
@@ -1114,60 +1116,37 @@ def collectCpuCount(influx_info, node, ci):
except Exception:
logging.error("cpu_count collection stopped unexpectedly with error: {}. Restarting process...".format(sys.exc_info()))
def countApiStatsServices(lsof_lines, service_port, service_name):
service_count = 0
for line in lsof_lines:
if service_port is not None and service_name is not None and service_port in line and service_name in line:
service_count += 1
return service_count
# collect API GET and POST requests/sec
def collectApi(influx_info, node, ci, openstack_svcs):
def collectApiStats(influx_info, node, ci, services):
logging.basicConfig(filename="/tmp/livestream.log", filemode="a", format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO)
logging.info("api_request data starting collection with a collection interval of {}s".format(ci["cpu_count"]))
measurement = "api_requests"
tags = {"node": node}
openstack_services = openstack_svcs
influx_string = ""
lsof_args = ['lsof', '-Pn', '-i', 'tcp']
while True:
try:
fields = {}
tmp = {}
tmp1 = {}
# get initial values
for s in openstack_services:
fields[s] = {"get": 0, "post": 0}
tmp[s] = {"get": 0, "post": 0}
log = "/var/log/{0}/{0}-api.log".format(s)
if os.path.exists(log):
if s == "ceilometer":
p = Popen("awk '/INFO/ && /500/' {} | wc -l".format(log), shell=True, stdout=PIPE)
else:
p = Popen("awk '/INFO/ && /GET/' {} | wc -l".format(log), shell=True, stdout=PIPE)
init_api_get = int(p.stdout.readline())
tmp[s]["get"] = init_api_get
p.kill()
p = Popen("awk '/INFO/ && /POST/' {} | wc -l".format(log), shell=True, stdout=PIPE)
init_api_post = int(p.stdout.readline())
tmp[s]["post"] = init_api_post
p.kill()
time.sleep(1)
# get new values
for s in openstack_services:
tmp1[s] = {"get": 0, "post": 0}
log = "/var/log/{0}/{0}-api.log".format(s)
if os.path.exists(log):
if s == "ceilometer":
p = Popen("awk '/INFO/ && /500/' {} | wc -l".format(log), shell=True, stdout=PIPE)
else:
p = Popen("awk '/INFO/ && /GET/' {} | wc -l".format(log), shell=True, stdout=PIPE)
api_get = int(p.stdout.readline())
tmp1[s]["get"] = api_get
p.kill()
p = Popen("awk '/INFO/ && /POST/' {} | wc -l".format(log), shell=True, stdout=PIPE)
api_post = int(p.stdout.readline())
tmp1[s]["post"] = api_post
p.kill()
# take difference
for key in fields:
if (key in tmp and key in tmp1) and (tmp1[key]["get"] >= tmp[key]["get"]) and (tmp1[key]["post"] >= tmp[key]["post"]):
fields[key]["get"] = (tmp1[key]["get"] - tmp[key]["get"])
fields[key]["post"] = (tmp1[key]["post"] - tmp[key]["post"])
influx_string += "{},'{}'='{}','{}'='{}' '{}'='{}','{}'='{}'".format(measurement, "node", tags["node"], "service", key, "get_requests", fields[key]["get"], "post_requests", fields[key]["post"]) + "\n"
fields = {}
lsof_result = Popen(lsof_args, shell=False, stdout=PIPE)
lsof_lines = list()
while True:
line = lsof_result.stdout.readline().strip("\n")
if not line:
break
lsof_lines.append(line)
lsof_result.kill()
for name, service in services.iteritems():
api_count = countApiStatsServices(lsof_lines, service['api-port'], service['name'])
db_count = countApiStatsServices(lsof_lines, service['db-port'], service['name'])
rabbit_count = countApiStatsServices(lsof_lines, service['rabbit-port'], service['name'])
fields[name] = {"api": api_count, "db": db_count, "rabbit": rabbit_count}
influx_string += "{},'{}'='{}','{}'='{}' '{}'='{}','{}'='{}','{}'='{}'".format(measurement, "node", tags["node"], "service", name, "api", fields[name]["api"], "db", fields[name]["db"], "rabbit", fields[name]["rabbit"]) + "\n"
p = Popen("curl -s -o /dev/null 'http://'{}':'{}'/write?db='{}'' --data-binary '{}'".format(influx_info[0], influx_info[1], influx_info[2], influx_string), shell=True)
p.communicate()
influx_string = ""
@@ -1177,7 +1156,6 @@ def collectApi(influx_info, node, ci, openstack_svcs):
logging.error("api_request collection stopped unexpectedly with error: {}. Restarting process...".format(sys.exc_info()))
time.sleep(3)
# returns the cores dedicated to platform use
def getPlatformCores(node, cpe):
if cpe is True or node.startswith("compute"):
@@ -1347,12 +1325,7 @@ if __name__ == "__main__":
common_services = list()
services = {}
live_svc = ("live_stream.py",)
static_svcs = ("occtop", "memtop", "schedtop", "top.sh", "iostat.sh", "netstats.sh", "diskstats.sh", "memstats.sh", "filestats.sh", "ceph.sh", "postgres.sh", "rabbitmq.sh", "vswitch.sh")
collection_intervals = {"memtop": None, "memstats": None, "occtop": None, "schedtop": None, "load_avg": None, "cpu_count": None, "diskstats": None, "iostat": None, "filestats": None, "netstats": None, "postgres": None, "rabbitmq": None, "vswitch": None}
openstack_services = ("nova", "cinder", "aodh", "ceilometer", "heat", "glance", "ceph", "horizon", "keystone", "puppet", "sysinv", "neutron", "nova_api", "postgres", "panko", "nova_cell0", "magnum", "ironic", "murano", "gnocchi")
# memstats, schedtop, and filestats must skip/exclude certain fields when collect_all is enabled. No need to collect this stuff
exclude_list = ("python", "python2", "bash", "perl", "sudo", "init")
skip_list = ("ps", "top", "sh", "<defunct>", "curl", "awk", "wc", "sleep", "lsof", "cut", "grep", "ip", "tail", "su")
duration = None
unconverted_duration = ""
collect_api_requests = False
@@ -1423,12 +1396,27 @@ if __name__ == "__main__":
storage_services = tuple(config.get("StorageServices", "STORAGE_SERVICE_LIST").split())
rabbit_services = tuple(config.get("RabbitmqServices", "RABBITMQ_QUEUE_LIST").split())
common_services = tuple(config.get("CommonServices", "COMMON_SERVICE_LIST").split())
static_svcs = tuple(config.get("StaticServices", "STATIC_SERVICE_LIST").split())
openstack_services = tuple(config.get("OpenStackServices", "OPEN_STACK_SERVICE_LIST").split())
skip_list = tuple(config.get("SkipList", "SKIP_LIST").split())
exclude_list = tuple(config.get("ExcludeList", "EXCLUDE_LIST").split())
# get collection intervals
for i in config.options("Intervals"):
if config.get("Intervals", i) == "" or config.get("Intervals", i) is None:
collection_intervals[i] = None
else:
collection_intervals[i] = int(config.get("Intervals", i))
# get api-stats services
DB_PORT_NUMBER = config.get("ApiStatsConstantPorts", "DB_PORT_NUMBER")
RABBIT_PORT_NUMBER = config.get("ApiStatsConstantPorts", "RABBIT_PORT_NUMBER")
SERVICES = OrderedDict()
SERVICES_INFO = tuple(config.get("ApiStatsServices", "API_STATS_STRUCTURE").split('|'))
for service_string in SERVICES_INFO:
service_tuple = tuple(service_string.split(';'))
if service_tuple[2] != "" and service_tuple[2] != None:
SERVICES[service_tuple[0]] = {'name': service_tuple[1], 'db-port': DB_PORT_NUMBER, 'rabbit-port': RABBIT_PORT_NUMBER, 'api-port': service_tuple[2]}
else:
SERVICES[service_tuple[0]] = {'name': service_tuple[1], 'db-port': DB_PORT_NUMBER, 'rabbit-port': RABBIT_PORT_NUMBER, 'api-port': None}
except Exception:
print "An error has occurred when parsing the engtools.conf configuration file: {}".format(sys.exc_info())
sys.exit(0)
@@ -1551,7 +1539,7 @@ if __name__ == "__main__":
tasks.append(p)
p.start()
if collect_api_requests is True and node_type == "controller":
p = Process(target=collectApi, args=(influx_info, node, collection_intervals, openstack_services), name="api_requests")
p = Process(target=collectApiStats, args=(influx_info, node, collection_intervals, SERVICES), name="api_requests")
tasks.append(p)
p.start()
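
For completeness, a standalone sketch of the lsof-based counting that
collectApiStats performs on every interval (the process-name pattern
and ports are taken from the configuration above; lsof needs enough
privilege to see the service's sockets):

from subprocess import Popen, PIPE

def count_connections(lsof_lines, port, name):
    # a TCP connection is attributed to a service when its lsof line
    # contains both the process-name pattern and the port number
    if port is None or name is None:
        return 0
    return sum(1 for line in lsof_lines if port in line and name in line)

# snapshot of all TCP sockets with numeric hosts and ports
output = Popen(['lsof', '-Pn', '-i', 'tcp'], stdout=PIPE,
               universal_newlines=True).communicate()[0]
lines = output.splitlines()

print(count_connections(lines, '6385', 'sysinv-ap'))  # API connections
print(count_connections(lines, '5432', 'sysinv-ap'))  # DB (PostgreSQL) connections
print(count_connections(lines, '5672', 'sysinv-ap'))  # RabbitMQ connections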