# Copyright 2016 Hewlett Packard Enterprise Development Company LP
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import datetime
import json

from pyspark.sql import SQLContext

from monasca_transform.component import Component
from monasca_transform.component.component_utils import ComponentUtils
from monasca_transform.component.setter import SetterComponent
from monasca_transform.transform.transform_utils import InstanceUsageUtils


class RollupQuantityException(Exception):
    """Exception thrown when doing quantity rollup

    Attributes:
        value: string representing the error
    """

    def __init__(self, value):
        self.value = value

    def __str__(self):
        return repr(self.value)


class RollupQuantity(SetterComponent):

    @staticmethod
    def _supported_rollup_operations():
        return ["sum", "max", "min", "avg"]

    @staticmethod
    def _is_valid_rollup_operation(operation):
        return operation in RollupQuantity._supported_rollup_operations()

    @staticmethod
def _rollup_quantity(instance_usage_df,
setter_rollup_group_by_list,
setter_rollup_operation):
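        """Roll up quantity by the given group by columns.

        Groups instance_usage_df by setter_rollup_group_by_list and
        applies setter_rollup_operation (sum, max, min or avg) to the
        quantity column, along with the earliest first-record timestamp,
        the latest last-record timestamp and the total record count for
        each group. Returns an RDD of instance usage dicts serialized
        as JSON strings.
        """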
instance_usage_data_json_list = []
# check if operation is valid
        if not RollupQuantity._is_valid_rollup_operation(
                setter_rollup_operation):
raise RollupQuantityException(
"Operation %s is not supported" % setter_rollup_operation)
# call required operation on grouped data
# e.g. sum, max, min, avg etc
agg_operations_map = {
"quantity": str(setter_rollup_operation),
"firstrecord_timestamp_unix": "min",
"lastrecord_timestamp_unix": "max",
"record_count": "sum"}
# do a group by
grouped_data = instance_usage_df.groupBy(
*setter_rollup_group_by_list)
rollup_df = grouped_data.agg(agg_operations_map)
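        # collect() materializes the rolled up rows on the driver,
        # where each row is converted into an instance usage dict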
for row in rollup_df.collect():
# first record timestamp
earliest_record_timestamp_unix = getattr(
row, "min(firstrecord_timestamp_unix)",
Component.DEFAULT_UNAVAILABLE_VALUE)
earliest_record_timestamp_string = \
datetime.datetime.utcfromtimestamp(
earliest_record_timestamp_unix).strftime(
'%Y-%m-%d %H:%M:%S')
# last record_timestamp
latest_record_timestamp_unix = getattr(
row, "max(lastrecord_timestamp_unix)",
Component.DEFAULT_UNAVAILABLE_VALUE)
latest_record_timestamp_string = \
datetime.datetime.utcfromtimestamp(
latest_record_timestamp_unix).strftime('%Y-%m-%d %H:%M:%S')
# record count
record_count = getattr(row, "sum(record_count)", 0.0)
# quantity
# get expression that will be used to select quantity
# from rolled up data
select_quant_str = "".join((setter_rollup_operation, "(quantity)"))
quantity = getattr(row, select_quant_str, 0.0)
try:
processing_meta = row.processing_meta
except AttributeError:
processing_meta = {}
# create a new instance usage dict
            instance_usage_dict = {
                "tenant_id": getattr(row, "tenant_id", "all"),
                "user_id": getattr(row, "user_id", "all"),
                "resource_uuid": getattr(row, "resource_uuid", "all"),
                "namespace": getattr(row, "namespace", "all"),
                "pod_name": getattr(row, "pod_name", "all"),
                "app": getattr(row, "app", "all"),
                "container_name": getattr(row, "container_name", "all"),
                "interface": getattr(row, "interface", "all"),
                "deployment": getattr(row, "deployment", "all"),
                "daemon_set": getattr(row, "daemon_set", "all"),
                "geolocation": getattr(row, "geolocation", "all"),
                "region": getattr(row, "region", "all"),
                "zone": getattr(row, "zone", "all"),
                "host": getattr(row, "host", "all"),
                "project_id": getattr(row, "tenant_id", "all"),
                "aggregated_metric_name":
                    getattr(row, "aggregated_metric_name", "all"),
                "quantity": quantity,
                "firstrecord_timestamp_unix": earliest_record_timestamp_unix,
                "firstrecord_timestamp_string":
                    earliest_record_timestamp_string,
                "lastrecord_timestamp_unix": latest_record_timestamp_unix,
                "lastrecord_timestamp_string": latest_record_timestamp_string,
                "record_count": record_count,
                "service_group": getattr(row, "service_group", "all"),
                "service_id": getattr(row, "service_id", "all"),
                "usage_date": getattr(row, "usage_date", "all"),
                "usage_hour": getattr(row, "usage_hour", "all"),
                "usage_minute": getattr(row, "usage_minute", "all"),
                "aggregation_period":
                    getattr(row, "aggregation_period", "all"),
                "processing_meta": processing_meta}
instance_usage_data_json = json.dumps(instance_usage_dict)
instance_usage_data_json_list.append(instance_usage_data_json)
# convert to rdd
spark_context = instance_usage_df.rdd.context
return spark_context.parallelize(instance_usage_data_json_list)

    @staticmethod
def setter(transform_context, instance_usage_df):
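        """Set fields in instance usage data by rolling up quantity.

        Reads the rollup operation (sum, max, min or avg) from the
        transform spec and delegates to setter_by_operation.
        """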
transform_spec_df = transform_context.transform_spec_df_info
# get rollup operation (sum, max, avg, min)
agg_params = transform_spec_df.select(
"aggregation_params_map.setter_rollup_operation").\
collect()[0].asDict()
setter_rollup_operation = agg_params["setter_rollup_operation"]
instance_usage_trans_df = RollupQuantity.setter_by_operation(
transform_context,
instance_usage_df,
setter_rollup_operation)
return instance_usage_trans_df

    @staticmethod
def setter_by_operation(transform_context, instance_usage_df,
setter_rollup_operation):
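        """Roll up instance usage data using the given operation.

        Reads the group by column list and the aggregation period from
        the transform spec, performs the rollup and converts the
        resulting JSON RDD back into an instance usage data frame.
        """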
transform_spec_df = transform_context.transform_spec_df_info
# get fields we want to group by for a rollup
agg_params = transform_spec_df.select(
"aggregation_params_map.setter_rollup_group_by_list"). \
collect()[0].asDict()
setter_rollup_group_by_list = agg_params["setter_rollup_group_by_list"]
# get aggregation period
agg_params = transform_spec_df.select(
"aggregation_params_map.aggregation_period").collect()[0].asDict()
aggregation_period = agg_params["aggregation_period"]
group_by_period_list = \
ComponentUtils._get_instance_group_by_period_list(
aggregation_period)
# group by columns list
group_by_columns_list = \
group_by_period_list + setter_rollup_group_by_list
# perform rollup operation
instance_usage_json_rdd = RollupQuantity._rollup_quantity(
instance_usage_df,
group_by_columns_list,
str(setter_rollup_operation))
sql_context = SQLContext.getOrCreate(instance_usage_df.rdd.context)
instance_usage_trans_df = InstanceUsageUtils.create_df_from_json_rdd(
sql_context,
instance_usage_json_rdd)
return instance_usage_trans_df

    @staticmethod
def do_rollup(setter_rollup_group_by_list,
aggregation_period,
setter_rollup_operation,
instance_usage_df):
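        """Roll up instance usage data without a transform spec.

        Same rollup as setter_by_operation, except that the group by
        list, aggregation period and rollup operation are passed in
        directly rather than read from a transform spec.
        """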
        # get group by columns for the aggregation period
group_by_period_list = \
ComponentUtils._get_instance_group_by_period_list(
aggregation_period)
# group by columns list
group_by_columns_list = group_by_period_list + \
setter_rollup_group_by_list
# perform rollup operation
instance_usage_json_rdd = RollupQuantity._rollup_quantity(
instance_usage_df,
group_by_columns_list,
str(setter_rollup_operation))
sql_context = SQLContext.getOrCreate(instance_usage_df.rdd.context)
instance_usage_trans_df = InstanceUsageUtils.create_df_from_json_rdd(
sql_context,
instance_usage_json_rdd)
return instance_usage_trans_df