Update API stats data collection

Replace the existing implementation of collectApi, which is not ideal
as it relies on parsing log files for GET/POST requests, with the
implementation from the api-stats.py tool. Only the total API, DB and
RabbitMQ connection stats for the services of interest are collected.
Per-PID stats for individual services are not collected since they
take up storage space and add little value.
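
For illustration, each collection interval now posts one entry per
service to the api_requests measurement, e.g. (node name and counts
are hypothetical):

api_requests,'node'='controller-0','service'='sysinv-api' 'api'='3','db'='2','rabbit'='1'
api_requests,'node'='controller-0','service'='neutron-server' 'api'='6','db'='4','rabbit'='2'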

Gunicorn stats currently represent the aggregate of all
gunicorn-based services (e.g. panko-api, aodh-api, keystone-public,
openstack_dashboard). They will be decomposed into per-service stats
in a subsequent commit.

Functional tests completed by Mathieu Godin.

Change-Id: I8a27fe3374b57d66e35da937a3a250caf78245d3
Story: 2002895
Task: 22858
Signed-off-by: Tee Ngo <tee.ngo@windriver.com>
Mathieu Godin 2018-07-03 17:02:24 -04:00 committed by Tee Ngo
parent df32881300
commit 1b1a84057d
2 changed files with 63 additions and 54 deletions

engtools.conf

@@ -47,10 +47,11 @@ netstats=10
postgres=30
rabbitmq=3600
vswitch=120
api_requests=5
[AdditionalOptions]
# Set this option to Y/N to enable/disable Openstack API GET/POST collection
API_REQUESTS=N
API_REQUESTS=Y
# Set this option to Y/N to enable/disable the collection of all services and not just the ones listed below. Note that this hasn't been tested thoroughly
ALL_SERVICES=N
@@ -75,3 +76,23 @@ RABBITMQ_QUEUE_LIST=notifications.info versioned_notifications.info
[CommonServices]
COMMON_SERVICE_LIST=dnsmasq ceilometer-polling haproxy hwmond pmond rmond fsmond sw-patch-agent sysinv-agent syslog-ng hostwd iscsid io-monitor-manager acpid hbsClient logmgmt mtcClient mtcalarmd mtclogd sshd ntpd ptp4l phc2sys smartd sm sm-eru sm-watchdog sm-api ceilometer keyring cinder-rtstool tuned polkitd lldpd IPaddr2 dnsmasq systemd-udevd systemd-journald logrotate collectd
[StaticServices]
STATIC_SERVICE_LIST=occtop memtop schedtop top.sh iostat.sh netstats.sh diskstats.sh memstats.sh filestats.sh ceph.sh postgres.sh rabbitmq.sh vswitch.sh
[OpenStackServices]
OPEN_STACK_SERVICE_LIST=nova cinder aodh ceilometer heat glance ceph horizon keystone puppet sysinv neutron nova_api postgres panko nova_cell0 magnum ironic murano gnocchi
[SkipList]
SKIP_LIST=ps top sh <defunct> curl awk wc sleep lsof cut grep ip tail su
[ExcludeList]
EXCLUDE_LIST=python python2 bash perl sudo init
[ApiStatsConstantPorts]
DB_PORT_NUMBER=5432
RABBIT_PORT_NUMBER=5672
[ApiStatsServices]
API_STATS_STRUCTURE=gunicorn;gunicorn;5000|sysinv-conductor;sysinv-co ;|neutron-server;neutron-s;9696|nova-conductor;nova-cond ;|sysinv-agent;sysinv-ag;|sysinv-api;sysinv-ap;6385|nova-api;nova-api ;18774|cinder-api;cinder-a;8776|glance-api;glance-a;9292|ceilometer;ceilomete;8777|vim;nfv-vim;4545|heat-api;heat-a;8004|heat-engine;heat-e;8004
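
For reference, each '|'-separated entry in API_STATS_STRUCTURE is
"label;process-name-pattern;api-port"; the api-port field may be empty
for services with no REST endpoint (e.g. sysinv-conductor), and the DB
and RabbitMQ ports are shared and come from [ApiStatsConstantPorts].
A minimal parsing sketch, mirroring the live_stream.py change below
(the structure string is truncated here for brevity):

from collections import OrderedDict

DB_PORT = "5432"
RABBIT_PORT = "5672"
structure = "sysinv-api;sysinv-ap;6385|sysinv-conductor;sysinv-co ;"

services = OrderedDict()
for entry in structure.split('|'):
    label, name, api_port = entry.split(';')
    services[label] = {'name': name,
                       'db-port': DB_PORT,
                       'rabbit-port': RABBIT_PORT,
                       # an empty api-port means only DB/RabbitMQ connections are counted
                       'api-port': api_port if api_port else None}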

live_stream.py

@@ -14,6 +14,8 @@ import psutil
import fcntl
import logging
import ConfigParser
import itertools
import six
from multiprocessing import Process, cpu_count
from subprocess import Popen, PIPE
from collections import OrderedDict
@@ -1114,60 +1116,37 @@ def collectCpuCount(influx_info, node, ci):
except Exception:
logging.error("cpu_count collection stopped unexpectedly with error: {}. Restarting process...".format(sys.exc_info()))
def countApiStatsServices(lsof_lines, service_port, service_name):
service_count = 0
for line in lsof_lines:
if service_port is not None and service_name is not None and service_port in line and service_name in line:
service_count += 1
return service_count
# collect API GET and POST requests/sec
def collectApi(influx_info, node, ci, openstack_svcs):
def collectApiStats(influx_info, node, ci, services):
logging.basicConfig(filename="/tmp/livestream.log", filemode="a", format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO)
logging.info("api_request data starting collection with a collection interval of {}s".format(ci["cpu_count"]))
measurement = "api_requests"
tags = {"node": node}
openstack_services = openstack_svcs
influx_string = ""
lsof_args = ['lsof', '-Pn', '-i', 'tcp']
while True:
try:
fields = {}
tmp = {}
tmp1 = {}
# get initial values
for s in openstack_services:
fields[s] = {"get": 0, "post": 0}
tmp[s] = {"get": 0, "post": 0}
log = "/var/log/{0}/{0}-api.log".format(s)
if os.path.exists(log):
if s == "ceilometer":
p = Popen("awk '/INFO/ && /500/' {} | wc -l".format(log), shell=True, stdout=PIPE)
else:
p = Popen("awk '/INFO/ && /GET/' {} | wc -l".format(log), shell=True, stdout=PIPE)
init_api_get = int(p.stdout.readline())
tmp[s]["get"] = init_api_get
p.kill()
p = Popen("awk '/INFO/ && /POST/' {} | wc -l".format(log), shell=True, stdout=PIPE)
init_api_post = int(p.stdout.readline())
tmp[s]["post"] = init_api_post
p.kill()
time.sleep(1)
# get new values
for s in openstack_services:
tmp1[s] = {"get": 0, "post": 0}
log = "/var/log/{0}/{0}-api.log".format(s)
if os.path.exists(log):
if s == "ceilometer":
p = Popen("awk '/INFO/ && /500/' {} | wc -l".format(log), shell=True, stdout=PIPE)
else:
p = Popen("awk '/INFO/ && /GET/' {} | wc -l".format(log), shell=True, stdout=PIPE)
api_get = int(p.stdout.readline())
tmp1[s]["get"] = api_get
p.kill()
p = Popen("awk '/INFO/ && /POST/' {} | wc -l".format(log), shell=True, stdout=PIPE)
api_post = int(p.stdout.readline())
tmp1[s]["post"] = api_post
p.kill()
# take difference
for key in fields:
if (key in tmp and key in tmp1) and (tmp1[key]["get"] >= tmp[key]["get"]) and (tmp1[key]["post"] >= tmp[key]["post"]):
fields[key]["get"] = (tmp1[key]["get"] - tmp[key]["get"])
fields[key]["post"] = (tmp1[key]["post"] - tmp[key]["post"])
influx_string += "{},'{}'='{}','{}'='{}' '{}'='{}','{}'='{}'".format(measurement, "node", tags["node"], "service", key, "get_requests", fields[key]["get"], "post_requests", fields[key]["post"]) + "\n"
fields = {}
lsof_result = Popen(lsof_args, shell=False, stdout=PIPE)
lsof_lines = list()
while True:
line = lsof_result.stdout.readline().strip("\n")
if not line:
break
lsof_lines.append(line)
lsof_result.kill()
for name, service in services.iteritems():
api_count = countApiStatsServices(lsof_lines, service['api-port'], service['name'])
db_count = countApiStatsServices(lsof_lines, service['db-port'], service['name'])
rabbit_count = countApiStatsServices(lsof_lines, service['rabbit-port'], service['name'])
fields[name] = {"api": api_count, "db": db_count, "rabbit": rabbit_count}
influx_string += "{},'{}'='{}','{}'='{}' '{}'='{}','{}'='{}','{}'='{}'".format(measurement, "node", tags["node"], "service", name, "api", fields[name]["api"], "db", fields[name]["db"], "rabbit", fields[name]["rabbit"]) + "\n"
p = Popen("curl -s -o /dev/null 'http://'{}':'{}'/write?db='{}'' --data-binary '{}'".format(influx_info[0], influx_info[1], influx_info[2], influx_string), shell=True)
p.communicate()
influx_string = ""
@@ -1177,7 +1156,6 @@ def collectApi(influx_info, node, ci, openstack_svcs):
logging.error("api_request collection stopped unexpectedly with error: {}. Restarting process...".format(sys.exc_info()))
time.sleep(3)
# returns the cores dedicated to platform use
def getPlatformCores(node, cpe):
if cpe is True or node.startswith("compute"):
@@ -1347,12 +1325,7 @@ if __name__ == "__main__":
common_services = list()
services = {}
live_svc = ("live_stream.py",)
static_svcs = ("occtop", "memtop", "schedtop", "top.sh", "iostat.sh", "netstats.sh", "diskstats.sh", "memstats.sh", "filestats.sh", "ceph.sh", "postgres.sh", "rabbitmq.sh", "vswitch.sh")
collection_intervals = {"memtop": None, "memstats": None, "occtop": None, "schedtop": None, "load_avg": None, "cpu_count": None, "diskstats": None, "iostat": None, "filestats": None, "netstats": None, "postgres": None, "rabbitmq": None, "vswitch": None}
openstack_services = ("nova", "cinder", "aodh", "ceilometer", "heat", "glance", "ceph", "horizon", "keystone", "puppet", "sysinv", "neutron", "nova_api", "postgres", "panko", "nova_cell0", "magnum", "ironic", "murano", "gnocchi")
# memstats, schedtop, and filestats must skip/exclude certain fields when collect_all is enabled. No need to collect this stuff
exclude_list = ("python", "python2", "bash", "perl", "sudo", "init")
skip_list = ("ps", "top", "sh", "<defunct>", "curl", "awk", "wc", "sleep", "lsof", "cut", "grep", "ip", "tail", "su")
duration = None
unconverted_duration = ""
collect_api_requests = False
@@ -1423,12 +1396,27 @@ if __name__ == "__main__":
storage_services = tuple(config.get("StorageServices", "STORAGE_SERVICE_LIST").split())
rabbit_services = tuple(config.get("RabbitmqServices", "RABBITMQ_QUEUE_LIST").split())
common_services = tuple(config.get("CommonServices", "COMMON_SERVICE_LIST").split())
static_svcs = tuple(config.get("StaticServices", "STATIC_SERVICE_LIST").split())
openstack_services = tuple(config.get("OpenStackServices", "OPEN_STACK_SERVICE_LIST").split())
skip_list = tuple(config.get("SkipList", "SKIP_LIST").split())
exclude_list = tuple(config.get("ExcludeList", "EXCLUDE_LIST").split())
# get collection intervals
for i in config.options("Intervals"):
if config.get("Intervals", i) == "" or config.get("Intervals", i) is None:
collection_intervals[i] = None
else:
collection_intervals[i] = int(config.get("Intervals", i))
# get api-stats services
DB_PORT_NUMBER = config.get("ApiStatsConstantPorts", "DB_PORT_NUMBER")
RABBIT_PORT_NUMBER = config.get("ApiStatsConstantPorts", "RABBIT_PORT_NUMBER")
SERVICES = OrderedDict()
SERVICES_INFO = tuple(config.get("ApiStatsServices", "API_STATS_STRUCTURE").split('|'))
for service_string in SERVICES_INFO:
service_tuple = tuple(service_string.split(';'))
if service_tuple[2] != "" and service_tuple[2] != None:
SERVICES[service_tuple[0]] = {'name': service_tuple[1], 'db-port': DB_PORT_NUMBER, 'rabbit-port': RABBIT_PORT_NUMBER, 'api-port': service_tuple[2]}
else:
SERVICES[service_tuple[0]] = {'name': service_tuple[1], 'db-port': DB_PORT_NUMBER, 'rabbit-port': RABBIT_PORT_NUMBER, 'api-port': None}
except Exception:
print "An error has occurred when parsing the engtools.conf configuration file: {}".format(sys.exc_info())
sys.exit(0)
@@ -1551,7 +1539,7 @@ if __name__ == "__main__":
tasks.append(p)
p.start()
if collect_api_requests is True and node_type == "controller":
p = Process(target=collectApi, args=(influx_info, node, collection_intervals, openstack_services), name="api_requests")
p = Process(target=collectApiStats, args=(influx_info, node, collection_intervals, SERVICES), name="api_requests")
tasks.append(p)
p.start()
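
For completeness, a standalone sketch of the lsof-based counting that
collectApiStats performs on every interval (the process-name pattern
and ports are taken from the configuration above; lsof needs enough
privilege to see the service's sockets):

from subprocess import Popen, PIPE

def count_connections(lsof_lines, port, name):
    # a TCP connection is attributed to a service when its lsof line
    # contains both the process-name pattern and the port number
    if port is None or name is None:
        return 0
    return sum(1 for line in lsof_lines if port in line and name in line)

# snapshot of all TCP sockets with numeric hosts and ports
output = Popen(['lsof', '-Pn', '-i', 'tcp'], stdout=PIPE,
               universal_newlines=True).communicate()[0]
lines = output.splitlines()

print(count_connections(lines, '6385', 'sysinv-ap'))  # API connections
print(count_connections(lines, '5432', 'sysinv-ap'))  # DB (PostgreSQL) connections
print(count_connections(lines, '5672', 'sysinv-ap'))  # RabbitMQ connections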