monasca-transform/monasca_transform/transform/grouping/group_sort_by_timestamp.py

# Copyright 2016 Hewlett Packard Enterprise Development Company LP
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from monasca_transform.transform.grouping import Grouping
from monasca_transform.transform.grouping import GroupingResults
from monasca_transform.transform.grouping import RecordStoreWithGroupBy


class GroupSortbyTimestamp(Grouping):

    @staticmethod
    def log_debug(logStr):
        print(logStr)
        # LOG.debug(logStr)

@staticmethod
def _prepare_for_group_by(record_store_with_group_by_rdd):
"""creates a new rdd where:
the first element of each row
contains array of grouping key and event timestamp fields.
Grouping key and event timestamp fields are used by
partitioning and sorting function to partition the data
by grouping key and then sort the elements in a group by the
timestamp
"""
# get the record store data and group by columns
record_store_data = record_store_with_group_by_rdd.record_store_data
group_by_columns_list = \
record_store_with_group_by_rdd.group_by_columns_list
# construct a group by key
# key1=value1^key2=value2^...
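        # e.g. with group_by_columns_list = ["host", "service"] and a row
        # where host is "h1" and service is "monitoring", the key becomes
        # "^host=h1^service=monitoring" (note the leading "^"; the column
        # names here are illustrative)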
group_by_key_value = ""
for gcol in group_by_columns_list:
            group_by_key_value = \
                "^".join((group_by_key_value,
                          "=".join((gcol,
                                    getattr(record_store_data, gcol)))))
        # return a (key, value) pair
        return [group_by_key_value, record_store_data]

@staticmethod
def _sort_by_timestamp(result_iterable):
# LOG.debug(whoami(result_iterable.data[0]))
        # sorting the whole group in memory might cause OOM if the
        # group has many items; use the group_sort_by_timestamp_partitions
        # module instead if you run into OOM
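        # note: this sorts on the string form of the timestamp, which
        # matches chronological order as long as the strings are
        # fixed-width (e.g. ISO-8601 or zero-padded epoch values)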
sorted_list = sorted(result_iterable.data,
key=lambda row: row.event_timestamp_string)
        return sorted_list

@staticmethod
def _group_sort_by_timestamp(record_store_df, group_by_columns_list):
        # convert the dataframe to an rdd and pair each row with the
        # group by column list
record_store_with_group_by_rdd = record_store_df.rdd.\
map(lambda x: RecordStoreWithGroupBy(x, group_by_columns_list))
# convert rdd into key-value rdd
record_store_with_group_by_rdd_key_val = \
record_store_with_group_by_rdd.\
map(GroupSortbyTimestamp._prepare_for_group_by)
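        # group rows that share the same grouping key, then sort each
        # group's rows by event timestamp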
first_step = record_store_with_group_by_rdd_key_val.groupByKey()
record_store_rdd_grouped_sorted = first_step.mapValues(
GroupSortbyTimestamp._sort_by_timestamp)
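        # the result is an rdd of (grouping_key, sorted_rows) pairs,
        # e.g. ("^host=h1^service=monitoring", [row_t0, row_t1, ...])
        # (the key shown here is illustrative)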
        return record_store_rdd_grouped_sorted

@staticmethod
def _get_group_first_last_quantity_udf(grouplistiter):
"""Return stats that include:
first row key, first_event_timestamp,
first event quantity, last_event_timestamp and last event quantity
"""
first_row = None
last_row = None
# extract key and value list
group_key = grouplistiter[0]
grouped_values = grouplistiter[1]
count = 0.0
for row in grouped_values:
# set the first row
if first_row is None:
first_row = row
# set the last row
last_row = row
count = count + 1
first_event_timestamp_unix = None
first_event_timestamp_string = None
first_event_quantity = None
if first_row is not None:
first_event_timestamp_unix = first_row.event_timestamp_unix
first_event_timestamp_string = first_row.event_timestamp_string
first_event_quantity = first_row.event_quantity
last_event_timestamp_unix = None
last_event_timestamp_string = None
last_event_quantity = None
if last_row is not None:
last_event_timestamp_unix = last_row.event_timestamp_unix
last_event_timestamp_string = last_row.event_timestamp_string
last_event_quantity = last_row.event_quantity
results_dict = {"firstrecord_timestamp_unix":
first_event_timestamp_unix,
"firstrecord_timestamp_string":
first_event_timestamp_string,
"firstrecord_quantity": first_event_quantity,
"lastrecord_timestamp_unix":
last_event_timestamp_unix,
"lastrecord_timestamp_string":
last_event_timestamp_string,
"lastrecord_quantity": last_event_quantity,
"record_count": count}
group_key_dict = Grouping._parse_grouping_key(group_key)
        return GroupingResults(group_key, results_dict, group_key_dict)

@staticmethod
def fetch_group_latest_oldest_quantity(record_store_df,
transform_spec_df,
group_by_columns_list):
"""Function to group record store data
Sort by timestamp within group
and get first and last timestamp along with quantity within each group
This function uses key-value pair rdd's groupBy function to do group_by
"""
# group and order elements in group
record_store_grouped_data_rdd = \
GroupSortbyTimestamp._group_sort_by_timestamp(
record_store_df, group_by_columns_list)
# find stats for a group
record_store_grouped_rows = \
record_store_grouped_data_rdd.\
map(GroupSortbyTimestamp.
_get_group_first_last_quantity_udf)
return record_store_grouped_rows
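
# A minimal usage sketch (not from the original module): the group-by
# column names below are hypothetical, and a live Spark context plus a
# record store DataFrame whose rows expose event_timestamp_unix,
# event_timestamp_string and event_quantity are assumed.
#
#     grouped_rdd = GroupSortbyTimestamp.fetch_group_latest_oldest_quantity(
#         record_store_df=record_store_df,
#         transform_spec_df=transform_spec_df,
#         group_by_columns_list=["tenant_id", "resource_uuid"])
#     for grouping_result in grouped_rdd.collect():
#         print(grouping_result)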