479 lines
21 KiB
Python
479 lines
21 KiB
Python
# Copyright 2016 Hewlett Packard Enterprise Development Company LP
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
|
# not use this file except in compliance with the License. You may obtain
|
|
# a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
# License for the specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
from collections import namedtuple
|
|
import datetime
|
|
|
|
from pyspark.sql import functions
|
|
from pyspark.sql import SQLContext
|
|
|
|
|
|
from monasca_transform.component import Component
|
|
from monasca_transform.component.component_utils import ComponentUtils
|
|
from monasca_transform.component.usage import UsageComponent
|
|
from monasca_transform.transform.grouping.group_sort_by_timestamp \
|
|
import GroupSortbyTimestamp
|
|
from monasca_transform.transform.grouping.group_sort_by_timestamp_partition \
|
|
import GroupSortbyTimestampPartition
|
|
from monasca_transform.transform.transform_utils import InstanceUsageUtils
|
|
from monasca_transform.transform.transform_utils import RecordStoreUtils
|
|
|
|
import json
|
|
|
|
|
|
class FetchQuantityException(Exception):
    """Raised when a quantity cannot be fetched.

    Attributes:
        value: string describing the error condition
    """

    def __init__(self, value):
        # Store the message on the instance so callers can inspect it
        # after catching the exception.
        self.value = value

    def __str__(self):
        # Render the stored message via repr-style formatting, matching
        # repr(self.value) for any value type.
        return "%r" % (self.value,)
|
|
|
|
|
# Lightweight carrier that pairs grouped record store data with the usage
# fetch operation and the group-by column list that apply to it.
# NOTE: the namedtuple typename ("GroupedDataWithOperation") intentionally
# differs from the variable name; a docstring-bearing subclass rebinds the
# GroupedDataNamedTuple name further down in this module.
GroupedDataNamedTuple = namedtuple("GroupedDataWithOperation",
                                   ["grouped_data",
                                    "usage_fetch_operation",
                                    "group_by_columns_list"])
|
|
|
|
|
|
class GroupedDataNamedTuple(GroupedDataNamedTuple):
    """A tuple which is a wrapper containing record store data and the usage operation

    namedtuple contains:

    grouped_data - grouped record store data
    usage_fetch_operation - operation to be performed on grouped data
    group_by_columns_list - list of group by columns
    """
|
|
|
|
|
|
class FetchQuantity(UsageComponent):
    """Usage component that fetches a quantity from record store data.

    Groups record store rows by a configurable column list and applies one
    of the supported fetch operations ("sum", "max", "min", "avg", "latest",
    "oldest") to produce instance usage data as JSON strings, which are then
    converted back into an instance usage dataframe.
    """

    @staticmethod
    def _supported_fetch_operations():
        """Return the list of fetch operations this component supports."""
        return ["sum", "max", "min", "avg", "latest", "oldest"]

    @staticmethod
    def _is_valid_fetch_operation(operation):
        """return true if its a valid fetch operation"""
        if operation in FetchQuantity._supported_fetch_operations():
            return True
        else:
            return False

    @staticmethod
    def _get_latest_oldest_quantity(grouped_data_named_tuple):
        """Get quantity for each group.

        By performing the requested usage operation and return a instance
        usage data.

        :param grouped_data_named_tuple: GroupedDataNamedTuple whose
            grouped_data holds the grouping result for one group (assumed
            to expose ``grouping_key_dict`` and ``results``, per the
            GroupSortbyTimestamp* helpers -- confirm against those classes),
            plus the fetch operation ("latest"/"oldest") and the group-by
            column list.
        :returns: JSON-encoded instance usage dict for this group.
        """
        # grouping result for this group
        grouping_results = grouped_data_named_tuple.\
            grouped_data

        # usage fetch operation
        usage_fetch_operation = grouped_data_named_tuple.\
            usage_fetch_operation

        # group_by_columns_list
        group_by_columns_list = grouped_data_named_tuple.\
            group_by_columns_list

        group_by_dict = grouping_results.grouping_key_dict

        # identity fields; any key absent from the grouping key falls back
        # to the component-wide unavailable marker
        tenant_id = group_by_dict.get("tenant_id",
                                      Component.DEFAULT_UNAVAILABLE_VALUE)
        resource_uuid = group_by_dict.get("resource_uuid",
                                          Component.DEFAULT_UNAVAILABLE_VALUE)
        user_id = group_by_dict.get("user_id",
                                    Component.DEFAULT_UNAVAILABLE_VALUE)

        geolocation = group_by_dict.get("geolocation",
                                        Component.DEFAULT_UNAVAILABLE_VALUE)
        region = group_by_dict.get("region",
                                   Component.DEFAULT_UNAVAILABLE_VALUE)
        zone = group_by_dict.get("zone", Component.DEFAULT_UNAVAILABLE_VALUE)
        host = group_by_dict.get("host", Component.DEFAULT_UNAVAILABLE_VALUE)

        usage_date = group_by_dict.get("event_date",
                                       Component.DEFAULT_UNAVAILABLE_VALUE)
        usage_hour = group_by_dict.get("event_hour",
                                       Component.DEFAULT_UNAVAILABLE_VALUE)
        usage_minute = group_by_dict.get("event_minute",
                                         Component.DEFAULT_UNAVAILABLE_VALUE)

        aggregated_metric_name = group_by_dict.get(
            "aggregated_metric_name", Component.DEFAULT_UNAVAILABLE_VALUE)

        # stats computed for this group
        agg_stats = grouping_results.results

        # pick the quantity for this group based on the requested operation;
        # stays None if the operation is neither "latest" nor "oldest"
        # (callers only route those two operations here)
        quantity = None
        if (usage_fetch_operation == "latest"):
            quantity = agg_stats["lastrecord_quantity"]
        elif usage_fetch_operation == "oldest":
            quantity = agg_stats["firstrecord_quantity"]

        firstrecord_timestamp_unix = agg_stats["firstrecord_timestamp_unix"]
        firstrecord_timestamp_string = \
            agg_stats["firstrecord_timestamp_string"]
        lastrecord_timestamp_unix = agg_stats["lastrecord_timestamp_unix"]
        lastrecord_timestamp_string = agg_stats["lastrecord_timestamp_string"]
        record_count = agg_stats["record_count"]

        # aggregation period -- not available at this stage
        aggregation_period = Component.DEFAULT_UNAVAILABLE_VALUE

        # event type
        event_type = group_by_dict.get("event_type",
                                       Component.DEFAULT_UNAVAILABLE_VALUE)

        # add group by fields data to extra data map
        # get existing extra_data_map if any
        extra_data_map = group_by_dict.get("extra_data_map", {})
        for column_name in group_by_columns_list:
            column_value = group_by_dict.get(column_name, Component.
                                             DEFAULT_UNAVAILABLE_VALUE)
            extra_data_map[column_name] = column_value

        instance_usage_dict = {"tenant_id": tenant_id, "user_id": user_id,
                               "resource_uuid": resource_uuid,
                               "geolocation": geolocation, "region": region,
                               "zone": zone, "host": host,
                               "aggregated_metric_name":
                               aggregated_metric_name,
                               "quantity": quantity,
                               "firstrecord_timestamp_unix":
                               firstrecord_timestamp_unix,
                               "firstrecord_timestamp_string":
                               firstrecord_timestamp_string,
                               "lastrecord_timestamp_unix":
                               lastrecord_timestamp_unix,
                               "lastrecord_timestamp_string":
                               lastrecord_timestamp_string,
                               "record_count": record_count,
                               "usage_date": usage_date,
                               "usage_hour": usage_hour,
                               "usage_minute": usage_minute,
                               "aggregation_period": aggregation_period,
                               "processing_meta": {"event_type": event_type},
                               "extra_data_map": extra_data_map
                               }
        instance_usage_data_json = json.dumps(instance_usage_dict)

        return instance_usage_data_json

    @staticmethod
    def _get_quantity(grouped_data_named_tuple):
        """Build instance usage JSON for one aggregated row.

        Used for the standard "sum"/"max"/"min"/"avg" operations, where the
        input row comes from a Spark ``groupBy(...).agg(...)`` result and
        aggregate values are exposed as attributes named after the aggregate
        expression (e.g. ``"min(event_timestamp_unix_for_min)"``).

        :param grouped_data_named_tuple: GroupedDataNamedTuple carrying the
            aggregated row, the fetch operation string and the group-by
            column list.
        :returns: JSON-encoded instance usage dict for this row.
        """

        # aggregated row
        row = grouped_data_named_tuple.grouped_data

        # usage fetch operation
        usage_fetch_operation = grouped_data_named_tuple.\
            usage_fetch_operation

        # group by columns list

        group_by_columns_list = grouped_data_named_tuple.\
            group_by_columns_list

        # first record timestamp # FIXME: beginning of epoch?
        # NOTE(review): if the aggregate attribute is missing, the
        # DEFAULT_UNAVAILABLE_VALUE fallback is fed to utcfromtimestamp
        # below -- presumably that attribute is always present; confirm.
        earliest_record_timestamp_unix = getattr(
            row, "min(event_timestamp_unix_for_min)",
            Component.DEFAULT_UNAVAILABLE_VALUE)
        earliest_record_timestamp_string = \
            datetime.datetime.utcfromtimestamp(
                earliest_record_timestamp_unix).strftime(
                '%Y-%m-%d %H:%M:%S')

        # last record_timestamp # FIXME: beginning of epoch?
        latest_record_timestamp_unix = getattr(
            row, "max(event_timestamp_unix_for_max)",
            Component.DEFAULT_UNAVAILABLE_VALUE)
        latest_record_timestamp_string = \
            datetime.datetime.utcfromtimestamp(
                latest_record_timestamp_unix).strftime('%Y-%m-%d %H:%M:%S')

        # record count
        record_count = getattr(row, "count(event_timestamp_unix)", 0.0)

        # quantity
        # get expression that will be used to select quantity
        # from rolled up data, e.g. "sum(event_quantity)"
        select_quant_str = "".join((usage_fetch_operation, "(event_quantity)"))
        quantity = getattr(row, select_quant_str, 0.0)

        # create a column name, value pairs from grouped data
        extra_data_map = InstanceUsageUtils.grouped_data_to_map(
            row,
            group_by_columns_list)

        # convert column names, so that values can be accessed by components
        # later in the pipeline
        extra_data_map = InstanceUsageUtils.prepare_extra_data_map(
            extra_data_map)

        # create a new instance usage dict; every identity field falls back
        # to the component-wide unavailable marker when absent from the row
        instance_usage_dict = {"tenant_id":
                               getattr(row, "tenant_id",
                                       Component.DEFAULT_UNAVAILABLE_VALUE),
                               "user_id":
                               getattr(row, "user_id",
                                       Component.DEFAULT_UNAVAILABLE_VALUE),
                               "resource_uuid":
                               getattr(row, "resource_uuid",
                                       Component.DEFAULT_UNAVAILABLE_VALUE),
                               "geolocation":
                               getattr(row, "geolocation",
                                       Component.DEFAULT_UNAVAILABLE_VALUE),
                               "region":
                               getattr(row, "region",
                                       Component.DEFAULT_UNAVAILABLE_VALUE),
                               "zone":
                               getattr(row, "zone",
                                       Component.DEFAULT_UNAVAILABLE_VALUE),
                               "host":
                               getattr(row, "host",
                                       Component.DEFAULT_UNAVAILABLE_VALUE),
                               # project_id deliberately mirrors tenant_id
                               "project_id":
                               getattr(row, "tenant_id",
                                       Component.DEFAULT_UNAVAILABLE_VALUE),
                               "aggregated_metric_name":
                               getattr(row, "aggregated_metric_name",
                                       Component.DEFAULT_UNAVAILABLE_VALUE),
                               "quantity":
                               quantity,
                               "firstrecord_timestamp_unix":
                               earliest_record_timestamp_unix,
                               "firstrecord_timestamp_string":
                               earliest_record_timestamp_string,
                               "lastrecord_timestamp_unix":
                               latest_record_timestamp_unix,
                               "lastrecord_timestamp_string":
                               latest_record_timestamp_string,
                               "record_count": record_count,
                               "usage_date":
                               getattr(row, "event_date",
                                       Component.DEFAULT_UNAVAILABLE_VALUE),
                               "usage_hour":
                               getattr(row, "event_hour",
                                       Component.DEFAULT_UNAVAILABLE_VALUE),
                               "usage_minute":
                               getattr(row, "event_minute",
                                       Component.DEFAULT_UNAVAILABLE_VALUE),
                               "aggregation_period":
                               getattr(row, "aggregation_period",
                                       Component.DEFAULT_UNAVAILABLE_VALUE),
                               "processing_meta": {"event_type": getattr(
                                   row, "event_type",
                                   Component.DEFAULT_UNAVAILABLE_VALUE)},
                               "extra_data_map": extra_data_map
                               }

        instance_usage_data_json = json.dumps(instance_usage_dict)
        return instance_usage_data_json

    @staticmethod
    def usage(transform_context, record_store_df):
        """Method to return the latest quantity as an instance usage dataframe:

        It groups together record store records by
        provided group by columns list , sorts within the group by event
        timestamp field, applies group stats udf and returns the latest
        quantity as an instance usage dataframe
        """
        transform_spec_df = transform_context.transform_spec_df_info

        # get rollup operation (sum, max, avg, min) from the transform spec
        agg_params = transform_spec_df.select(
            "aggregation_params_map.usage_fetch_operation").\
            collect()[0].asDict()
        usage_fetch_operation = agg_params["usage_fetch_operation"]

        # delegate the actual work to the operation-specific entry point
        instance_usage_df = FetchQuantity.usage_by_operation(
            transform_context, record_store_df, usage_fetch_operation)

        return instance_usage_df

    @staticmethod
    def usage_by_operation(transform_context, record_store_df,
                           usage_fetch_operation):
        """Returns the latest quantity as a instance usage dataframe

        It groups together record store records by
        provided group by columns list , sorts within the group by event
        timestamp field, applies group stats udf and returns the latest
        quantity as an instance usage dataframe

        :param transform_context: context carrying the transform spec
            dataframe (``transform_spec_df_info``).
        :param record_store_df: record store dataframe to aggregate.
        :param usage_fetch_operation: one of the operations returned by
            ``_supported_fetch_operations``.
        :raises FetchQuantityException: if the operation is unsupported or a
            filter specification is incomplete/invalid.
        """
        transform_spec_df = transform_context.transform_spec_df_info

        # check if operation is valid
        if not FetchQuantity. \
                _is_valid_fetch_operation(usage_fetch_operation):
            raise FetchQuantityException(
                "Operation %s is not supported" % usage_fetch_operation)

        # get aggregation period
        agg_params = transform_spec_df.select(
            "aggregation_params_map.aggregation_period").collect()[0].asDict()
        aggregation_period = agg_params["aggregation_period"]
        group_by_period_list = ComponentUtils._get_group_by_period_list(
            aggregation_period)

        # retrieve filter specifications
        agg_params = transform_spec_df.select(
            "aggregation_params_map.filter_by_list"). \
            collect()[0].asDict()
        filter_by_list = \
            agg_params["filter_by_list"]

        # if filter(s) have been specified, apply them one at a time
        if filter_by_list:
            for filter_element in filter_by_list:
                field_to_filter = filter_element["field_to_filter"]
                filter_expression = filter_element["filter_expression"]
                filter_operation = filter_element["filter_operation"]

                if (field_to_filter and
                        filter_expression and
                        filter_operation and
                        (filter_operation == "include" or
                         filter_operation == "exclude")):
                    # "include" keeps rows matching the regex,
                    # "exclude" keeps rows that do not match
                    if filter_operation == "include":
                        match = True
                    else:
                        match = False
                    # apply the specified filter to the record store
                    record_store_df = record_store_df.where(
                        functions.col(str(field_to_filter)).rlike(
                            str(filter_expression)) == match)
                else:
                    raise FetchQuantityException(
                        "Encountered invalid filter details: "
                        "field to filter = %s, filter expression = %s, "
                        "filter operation = %s. All values must be "
                        "supplied and filter operation must be either "
                        "'include' or 'exclude'." % (field_to_filter,
                                                     filter_expression,
                                                     filter_operation))

        # get what we want to group by
        agg_params = transform_spec_df.select(
            "aggregation_params_map.aggregation_group_by_list"). \
            collect()[0].asDict()
        aggregation_group_by_list = agg_params["aggregation_group_by_list"]

        # group by columns list: period columns first, then the
        # spec-provided grouping columns
        group_by_columns_list = group_by_period_list + \
            aggregation_group_by_list

        # prepare group by columns list
        group_by_columns_list = \
            RecordStoreUtils.prepare_recordstore_group_by_list(
                group_by_columns_list)

        instance_usage_json_rdd = None
        if (usage_fetch_operation == "latest" or
                usage_fetch_operation == "oldest"):

            grouped_rows_rdd = None

            # FIXME:
            # select group by method
            IS_GROUP_BY_PARTITION = False

            if (IS_GROUP_BY_PARTITION):
                # GroupSortbyTimestampPartition is a more scalable
                # since it creates groups using repartitioning and sorting
                # but is disabled

                # number of groups should be more than what is expected
                # this might be hard to guess. Setting this to a very
                # high number is adversely affecting performance
                num_of_groups = 100
                grouped_rows_rdd = \
                    GroupSortbyTimestampPartition. \
                    fetch_group_latest_oldest_quantity(
                        record_store_df, transform_spec_df,
                        group_by_columns_list,
                        num_of_groups)
            else:
                # group using key-value pair RDD's groupByKey()
                grouped_rows_rdd = \
                    GroupSortbyTimestamp. \
                    fetch_group_latest_oldest_quantity(
                        record_store_df, transform_spec_df,
                        group_by_columns_list)

            # pair each group with the operation and group-by columns so the
            # map function below has everything it needs
            grouped_data_rdd_with_operation = grouped_rows_rdd.map(
                lambda x:
                GroupedDataNamedTuple(x,
                                      str(usage_fetch_operation),
                                      group_by_columns_list))

            instance_usage_json_rdd = \
                grouped_data_rdd_with_operation.map(
                    FetchQuantity._get_latest_oldest_quantity)
        else:
            # duplicate the event timestamp under two aliases so min and
            # max aggregates can be computed over it independently
            record_store_df_int = \
                record_store_df.select(
                    record_store_df.event_timestamp_unix.alias(
                        "event_timestamp_unix_for_min"),
                    record_store_df.event_timestamp_unix.alias(
                        "event_timestamp_unix_for_max"),
                    "*")

            # for standard sum, max, min, avg operations on grouped data
            agg_operations_map = {
                "event_quantity": str(usage_fetch_operation),
                "event_timestamp_unix_for_min": "min",
                "event_timestamp_unix_for_max": "max",
                "event_timestamp_unix": "count"}

            # do a group by
            grouped_data = record_store_df_int.groupBy(*group_by_columns_list)
            grouped_record_store_df = grouped_data.agg(agg_operations_map)

            grouped_data_rdd_with_operation = grouped_record_store_df.rdd.map(
                lambda x:
                GroupedDataNamedTuple(x,
                                      str(usage_fetch_operation),
                                      group_by_columns_list))

            instance_usage_json_rdd = grouped_data_rdd_with_operation.map(
                FetchQuantity._get_quantity)

        # convert the JSON strings back into an instance usage dataframe
        sql_context = SQLContext.getOrCreate(record_store_df.rdd.context)
        instance_usage_df = \
            InstanceUsageUtils.create_df_from_json_rdd(sql_context,
                                                       instance_usage_json_rdd)
        return instance_usage_df
|