It turned out that the first portion of the results was collected in an environment with a sporadic connectivity issue between Keystone and Memcache. Valid results were collected and processed.

Change-Id: I6c8f388ef380a19016c4bc9cf49582c8a8f433d1
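
"""Summarize Keystone DB and cache activity from OSprofiler JSON traces.

Builds per-request tables: overall DB query stats, outlier queries,
multi-JOIN queries, and cached-method usage.
"""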

import argparse
import collections
import copy
import itertools
import json
import os

import numpy as np
import prettytable


NODES_TEMPLATE = {
    "DB queries": {
        "total": 0,
        "total_time_spent": 0,
        "select1": 0,
        "select1_time_spent": 0,
        "real": 0,
        "real_time_spent": 0,
        "SELECT": {
            "total": 0,
            "INNER JOIN": 0,
            "LEFT JOIN": 0,
            "RIGHT JOIN": 0,
            "FULL JOIN": 0,
            "time_spent": collections.OrderedDict(),
        },
        "INSERT": {
            "total": 0,
            "time_spent": collections.OrderedDict(),
        },
        "UPDATE": {
            "total": 0,
            "time_spent": collections.OrderedDict(),
        },
        "DELETE": {
            "total": 0,
            "time_spent": collections.OrderedDict(),
        },
        "red_flag": {
            "2joins": {
                "total": 0,
                "queries": [],
                "time_spent": collections.OrderedDict(),
            },
            "3+joins": {
                "total": 0,
                "queries": [],
                "time_spent": collections.OrderedDict(),
            }
        }
    },
    "Memcache cached operations": {},
    "Context cached operations": {},
    "Cached time spent": collections.OrderedDict(),
}

NODES = copy.deepcopy(NODES_TEMPLATE)
OUTLIER_QUERIES = {}
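
# NODES accumulates the statistics for a single trace. Every "time_spent"
# OrderedDict maps a duration in ms to the list of operations that took
# exactly that long, e.g. {12: ["SELECT ...", "SELECT ..."]} records two
# 12 ms queries; sort_dicts() later collapses these maps into totals.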


def define_node(node):
    """Dispatch a Keystone trace node to the matching handler."""
    if node["info"]["project"] != "keystone":
        return

    if node["info"]["name"] not in ["db", "cache", "memoize"]:
        return

    time_spent = node["info"]["finished"] - node["info"]["started"]

    if node["info"]["name"] == "db":
        process_db_calls(node, time_spent)
    elif node["info"]["name"] == "cache":
        if not node["children"]:
            process_cache_calls(node, time_spent, "cache")
        else:
            for child in node["children"]:
                define_node(child)
    elif node["info"]["name"] == "memoize":
        if not node["children"] or len(node["children"]) == 1:
            process_cache_calls(node, time_spent)
        else:
            for child in node["children"]:
                define_node(child)


def process_cache_calls(node, time_spent, prefix="memoize"):
    """Record a cached operation under the Memcache or context cache."""
    cache_info = node["info"]["meta.raw_payload.%s-start" % prefix][
        "info"]["fn_info"]

    def add_info(pref):
        if not NODES["%s cached operations" % pref].get(cache_info):
            NODES["%s cached operations" % pref][cache_info] = 0
        NODES["%s cached operations" % pref][cache_info] += 1
        if not NODES["Cached time spent"].get(time_spent):
            NODES["Cached time spent"][time_spent] = []
        NODES["Cached time spent"][time_spent].append(cache_info)

    # Liberty env: all cache operations hit Memcache
    if not node["children"]:
        add_info("Memcache")
        return

    # Mitaka env with nested 2-layer cache
    ctxt_cache_info = node["children"][0]
    if ctxt_cache_info["children"]:
        memcache_cache_info = ctxt_cache_info["children"][0]
        if not memcache_cache_info["children"]:
            add_info("Memcache")
    else:
        add_info("Context")


def process_db_calls(node, time_spent):
    """Classify a DB call as infrastructure (SELECT 1) or real work."""
    NODES["DB queries"]["total"] += 1
    NODES["DB queries"]["total_time_spent"] += time_spent
    statement = node[
        "info"]["meta.raw_payload.db-start"]["info"]["db"]["statement"]
    if statement.startswith("SELECT 1"):
        NODES["DB queries"]["select1"] += 1
        NODES["DB queries"]["select1_time_spent"] += time_spent
    else:
        NODES["DB queries"]["real"] += 1
        NODES["DB queries"]["real_time_spent"] += time_spent

        if statement.startswith("SELECT"):
            process_selects(statement, time_spent)
        elif statement.startswith("UPDATE"):
            process_base_db_calls("UPDATE", statement, time_spent)
        elif statement.startswith("INSERT"):
            process_base_db_calls("INSERT", statement, time_spent)
        elif statement.startswith("DELETE"):
            process_base_db_calls("DELETE", statement, time_spent)


def process_base_db_calls(command, statement, time_spent):
    NODES["DB queries"][command]["total"] += 1
    if not NODES["DB queries"][command]["time_spent"].get(time_spent):
        NODES["DB queries"][command]["time_spent"][time_spent] = []
    NODES["DB queries"][command]["time_spent"][time_spent].append(
        statement)


def process_selects(statement, time_spent):
    process_base_db_calls("SELECT", statement, time_spent)

    ij = statement.count("INNER JOIN")
    lj = statement.count("LEFT JOIN")
    rj = statement.count("RIGHT JOIN")
    fj = statement.count("FULL JOIN")
    NODES["DB queries"]["SELECT"]["INNER JOIN"] += ij
    NODES["DB queries"]["SELECT"]["LEFT JOIN"] += lj
    NODES["DB queries"]["SELECT"]["RIGHT JOIN"] += rj
    NODES["DB queries"]["SELECT"]["FULL JOIN"] += fj

    # raise red flags if too many JOINs are met
    if ij + lj + rj + fj == 2:
        NODES["DB queries"]["red_flag"]["2joins"]["total"] += 1
        NODES["DB queries"]["red_flag"]["2joins"][
            "queries"].append(statement)
        if not NODES["DB queries"]["red_flag"]["2joins"][
                "time_spent"].get(time_spent):
            NODES["DB queries"]["red_flag"]["2joins"]["time_spent"][
                time_spent] = []
        NODES["DB queries"]["red_flag"]["2joins"]["time_spent"][
            time_spent].append(statement)
    elif ij + lj + rj + fj >= 3:
        NODES["DB queries"]["red_flag"]["3+joins"]["total"] += 1
        NODES["DB queries"]["red_flag"]["3+joins"][
            "queries"].append(statement)
        if not NODES["DB queries"]["red_flag"]["3+joins"][
                "time_spent"].get(time_spent):
            NODES["DB queries"]["red_flag"]["3+joins"]["time_spent"][
                time_spent] = []
        NODES["DB queries"]["red_flag"]["3+joins"]["time_spent"][
            time_spent].append(statement)
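
# For example, a SELECT with one INNER JOIN and one LEFT JOIN counts as
# "2joins", while any statement with three or more JOINs of whatever kind
# is tracked under "3+joins".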


def define_nodes(data):
    """Walk the trace tree, handing leaf and memoize nodes to define_node."""
    for child in data["children"]:
        if not child["children"] or child["info"]["name"] == "memoize":
            define_node(child)
        else:
            define_nodes(child)


def sort_dicts(dictionary):
    """Collapse the per-duration maps into total time spent, in ms."""
    new_nodes = copy.deepcopy(dictionary)
    for key in ["SELECT", "INSERT", "DELETE", "UPDATE"]:
        new_nodes["DB queries"][key]["time_spent"] = \
            sum([k * len(v) for k, v
                 in dictionary["DB queries"][key]["time_spent"].items()])
    for key in ["2joins", "3+joins"]:
        new_nodes["DB queries"]["red_flag"][key]["time_spent"] = \
            sum([k * len(v) for k, v
                 in dictionary["DB queries"]["red_flag"][key][
                     "time_spent"].items()])
    new_nodes["Cached time spent"] = \
        sum([k * len(v) for k, v
             in dictionary["Cached time spent"].items()])
    return new_nodes
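
# E.g. a "time_spent" map of {12: [q1, q2], 40: [q3]} collapses into
# 12 * 2 + 40 * 1 = 64 ms of total time for that query class.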


def detect_outliers(data, m=2.):
    """Flag durations deviating from the median by >= m MADs.

    MAD is the median absolute deviation; queries at flagged durations
    are collected into OUTLIER_QUERIES.
    """
    full_time_set = list(itertools.chain(*[[k] * len(v) for k, v
                                           in data.items()]))
    dat = np.abs(full_time_set - np.median(full_time_set))
    mdev = np.median(dat)
    sss = dat / mdev if mdev else 0.
    if mdev:
        for idx, val in enumerate((sss < m).tolist()):
            if not val:
                for query in data[full_time_set[idx]]:
                    OUTLIER_QUERIES[query] = full_time_set[idx]
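
# Worked example for the rule above: for durations [10, 11, 12, 95] the
# median is 11.5 and the absolute deviations are [1.5, 0.5, 0.5, 83.5],
# whose median (mdev) is 1.0; 95 scores 83.5 >= m == 2 and is reported,
# while the other durations stay below the threshold.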


def prepare_tables(nodes):
    # prepare table with common information
    common_info_table = prettytable.PrettyTable(["**Metric**", "**Value**"])
    common_info_table.align["**Metric**"] = "l"
    common_info_table.align["**Value**"] = "l"
    common_info_table.padding_width = 1
    common_info_table.max_width = 100
    common_info_table.header = True
    common_info_table.hrules = prettytable.ALL

    common_info_table.add_row(["Total (*) Keystone DB queries count",
                               nodes["DB queries"]["total"]])
    common_info_table.add_row(["Total (*) Keystone DB queries time spent, ms",
                               nodes["DB queries"]["total_time_spent"]])
    common_info_table.add_row([
        "Infrastructure (SELECT 1) Keystone DB queries count",
        nodes["DB queries"]["select1"]])
    common_info_table.add_row([
        "Infrastructure (SELECT 1) Keystone DB queries time spent, ms",
        nodes["DB queries"]["select1_time_spent"]])
    common_info_table.add_row(["Real Keystone DB queries count",
                               nodes["DB queries"]["real"]])
    common_info_table.add_row(["Real Keystone DB queries time spent, ms",
                               nodes["DB queries"]["real_time_spent"]])

    db_query_tmpl = "%s\n\n|"

    for key in ["SELECT", "INSERT", "DELETE", "UPDATE"]:
        if nodes["DB queries"][key]["total"]:
            common_info_table.add_row([
                "%s Keystone DB queries count" % key,
                nodes["DB queries"][key]["total"]])
            common_info_table.add_row([
                "%s Keystone DB queries time spent, ms" % key,
                nodes["DB queries"][key]["time_spent"]])
        # outlier detection needs the per-duration map, so it reads the
        # global NODES rather than the summed copy in `nodes`
        detect_outliers(NODES["DB queries"][key]["time_spent"])

    # prepare table with outliers information
    outliers_table = prettytable.PrettyTable(["**DB query**",
                                              "**Time spent, ms**"])
    outliers_table.align["**DB query**"] = "l"
    outliers_table.align["**Time spent, ms**"] = "l"
    outliers_table.max_width = 100
    outliers_table.header = True
    outliers_table.hrules = prettytable.ALL

    for query in OUTLIER_QUERIES:
        outliers_table.add_row([db_query_tmpl % query, OUTLIER_QUERIES[query]])

    # prepare table with information about DB requests containing multiple
    # JOIN statements inside
    multi_join_queries = prettytable.PrettyTable(["**DB query**",
                                                  "**Time spent, ms**"])
    multi_join_queries.align["**DB query**"] = "l"
    multi_join_queries.align["**Time spent, ms**"] = "l"
    multi_join_queries.max_width = 100
    multi_join_queries.header = True
    multi_join_queries.hrules = prettytable.ALL

    for key in ["2joins", "3+joins"]:
        for ts in NODES["DB queries"]["red_flag"][key]["time_spent"]:
            for query in NODES["DB queries"]["red_flag"][key][
                    "time_spent"][ts]:
                multi_join_queries.add_row([db_query_tmpl % query, ts])

    # prepare table(s) with cache info
    cache_table = prettytable.PrettyTable(["**Cache**",
                                           "**Cached operations**",
                                           "**args**",
                                           "**kwargs**",
                                           "**Times used**"])

    name_map = {
        "Memcache cached operations": "Memcache",
        "Context cached operations": "Local context",
    }
    cache_table.align["**Cache**"] = "l"
    cache_table.align["**Cached operations**"] = "l"
    cache_table.align["**args**"] = "l"
    cache_table.align["**kwargs**"] = "l"
    cache_table.align["**Times used**"] = "l"
    cache_table.max_width = 100
    cache_table.header = True
    cache_table.hrules = prettytable.ALL

    for cache in ["Memcache cached operations", "Context cached operations"]:
        for operation, times in nodes[cache].items():
            # fn_info is a stringified tuple "(name, (args), {kwargs})";
            # peel the outer parens and split into name / args / kwargs
            operation = operation[1:-1].split(", ")
            cache_table.add_row([name_map[cache],
                                 operation[0][1:-1],
                                 ", ".join(operation[1:-1])[1:-1],
                                 operation[-1][1:-1], times])

    return common_info_table, multi_join_queries, outliers_table, cache_table
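
# Note: the "**…**" cell markers and the "\n\n|" tail in db_query_tmpl
# suggest the tables are pasted into reStructuredText documents, where
# PrettyTable's grid output doubles as a reST grid table.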


def main():
    parser = argparse.ArgumentParser(description='Process JSON file with '
                                                 'OSprofiler output.')
    parser.add_argument('path', type=str,
                        help='Path to the JSON file / directory with list of '
                             'JSON files with OSprofiler output')

    args = parser.parse_args()
    global NODES
    if os.path.isfile(args.path):
        with open(args.path) as data_file:
            data = json.load(data_file)
            define_nodes(data)
            nodes = sort_dicts(NODES)
            common_info_table, multi_join_queries, outliers_table,\
                cache_table = prepare_tables(nodes)
            print(common_info_table)
            print(outliers_table)
            print(multi_join_queries)
            print(cache_table)
    elif os.path.isdir(args.path):
        # traces are expected to be JSON saved under a .txt extension
        for item in os.listdir(args.path):
            if item.endswith(".txt"):
                with open(os.path.join(args.path, item)) as data_file:
                    data = json.load(data_file)
                    # reset per-file aggregation state
                    NODES = copy.deepcopy(NODES_TEMPLATE)
                    OUTLIER_QUERIES.clear()
                    define_nodes(data)
                    nodes = sort_dicts(NODES)
                    common_info_table, multi_join_queries, outliers_table,\
                        cache_table = prepare_tables(nodes)
                    item_name = \
                        item.split(".")[0].replace("_", " ").capitalize() + \
                        " request stats"
                    print(item_name)
                    print(len(item_name) * "~" + "\n")
                    print("**%s**\n" % "Control plane request overlook")
                    print(common_info_table)
                    print("\n**%s**\n" % "Keystone DB queries outliers")
                    print(outliers_table)
                    print("\n**%s**\n" % "Keystone DB queries with multi "
                                         "JOINs inside")
                    print(multi_join_queries)
                    print("\n")
                    print("**Keystone cached methods stats**\n")
                    print(cache_table)
                    print("\n")


if __name__ == "__main__":
    main()
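
# Usage sketch (file names are illustrative, not from the source):
#   python parse_profiler_output.py trace.json
#   python parse_profiler_output.py results_dir/
# Directory mode expects JSON traces saved with a .txt extension.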