# monasca-transform/monasca_transform/transform/grouping/group_sort_by_timestamp_par...
# Copyright 2016 Hewlett Packard Enterprise Development Company LP
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from monasca_transform.transform.grouping import Grouping
from monasca_transform.transform.grouping import GroupingResults
from monasca_transform.transform.grouping import RecordStoreWithGroupBy


class GroupSortbyTimestampPartition(Grouping):
    @staticmethod
    def log_debug(logStr):
        print(logStr)
        # LOG.debug(logStr)

@staticmethod
def _get_group_first_last_quantity_udf(partition_list_iter):
"""User defined function to go through a list of partitions.
Each partition contains elements for a group. All the elements are sorted by
timestamp.
The stats include first row key, first_event_timestamp,
fist event quantity, last_event_timestamp and last event quantity
"""
first_row = None
last_row = None
count = 0.0
for row in partition_list_iter:
# set the first row
if first_row is None:
first_row = row
# set the last row
last_row = row
count = count + 1
first_event_timestamp_unix = None
first_event_timestamp_string = None
first_event_quantity = None
first_row_key = None
if first_row is not None:
first_event_timestamp_unix = first_row[1].event_timestamp_unix
first_event_timestamp_string = first_row[1].event_timestamp_string
first_event_quantity = first_row[1].event_quantity
# extract the grouping_key from composite grouping_key
# composite grouping key is a list, where first item is the
# grouping key and second item is the event_timestamp_string
first_row_key = first_row[0][0]
last_event_timestamp_unix = None
last_event_timestamp_string = None
last_event_quantity = None
if last_row is not None:
last_event_timestamp_unix = last_row[1].event_timestamp_unix
last_event_timestamp_string = last_row[1].event_timestamp_string
last_event_quantity = last_row[1].event_quantity
results_dict = {"firstrecord_timestamp_unix":
first_event_timestamp_unix,
"firstrecord_timestamp_string":
first_event_timestamp_string,
"firstrecord_quantity": first_event_quantity,
"lastrecord_timestamp_unix":
last_event_timestamp_unix,
"lastrecord_timestamp_string":
last_event_timestamp_string,
"lastrecord_quantity": last_event_quantity,
"record_count": count}
first_row_key_dict = Grouping._parse_grouping_key(first_row_key)
yield [GroupingResults(first_row_key, results_dict,
first_row_key_dict)]
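
    # Illustrative sketch of the shapes involved (assumed values, not in
    # the original code): each element of partition_list_iter is a
    # (composite_key, record) pair built by _prepare_for_group_by, e.g.
    #
    #   (["^host=h1^metric=cpu", "2016-01-01 00:00:00"], record_store_row)
    #
    # and the generator yields one single-element list per partition, e.g.
    #
    #   [GroupingResults("^host=h1^metric=cpu",
    #                    {"firstrecord_timestamp_unix": 1451606400.0,
    #                     ...,
    #                     "record_count": 42.0},
    #                    {"host": "h1", "metric": "cpu"})]
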
@staticmethod
def _prepare_for_group_by(record_store_with_group_by_rdd):
"""Creates a new rdd where:
The first element of each row contains array of grouping
key and event timestamp fields.
Grouping key and event timestamp fields are used by
partitioning and sorting function to partition the data
by grouping key and then sort the elements in a group by the
timestamp
"""
# get the record store data and group by columns
record_store_data = record_store_with_group_by_rdd.record_store_data
group_by_columns_list = \
record_store_with_group_by_rdd.group_by_columns_list
        # construct a group by key:
        # ^key1=value1^key2=value2...
        group_by_key_value = ""
        for gcol in group_by_columns_list:
            # getattr replaces the original eval() over a dynamically
            # built "record_store_data.<gcol>" expression; same value,
            # no string evaluation
            group_by_key_value = \
                "^".join((group_by_key_value,
                          "=".join((gcol, getattr(record_store_data,
                                                  gcol)))))
# return a key-value rdd
# key is a composite key which consists of grouping key and
# event_timestamp_string
return [[group_by_key_value,
record_store_data.event_timestamp_string], record_store_data]
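
    # For example (hypothetical column values, for illustration only):
    # with group_by_columns_list = ["host", "metric_name"] and a record
    # where host="h1" and metric_name="cpu.idle", the row returned above is
    #
    #   [["^host=h1^metric_name=cpu.idle", "2016-01-01 00:00:00"],
    #    record_store_data]
    #
    # i.e. the grouping key string plus the event timestamp string that
    # later serves as the within-partition sort key.
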
    @staticmethod
    def _get_partition_by_group(group_composite):
        """Get a hash of the grouping key.

        The partitioning function uses this hash, modulo the number of
        partitions, to pick the partition a group's data should end up in.
        """
        # FIXME: find out if the built-in hash function gives the same
        # value on different machines
        # Look at using the portable_hash method in spark rdd
        grouping_key = group_composite[0]
        grouping_key_hash = hash(grouping_key)
        # log_debug("group_by_sort_by_timestamp_partition: got hash: %s"
        #           % str(grouping_key_hash))
        return grouping_key_hash
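
    # A minimal sketch of the portable_hash alternative mentioned in the
    # FIXME above (assumes PySpark is importable here; not wired in):
    #
    #   from pyspark.rdd import portable_hash
    #
    #   @staticmethod
    #   def _get_partition_by_group(group_composite):
    #       # portable_hash is PySpark's default partitioning hash and,
    #       # with PYTHONHASHSEED set, gives consistent values across
    #       # executor processes, unlike the built-in hash()
    #       return portable_hash(group_composite[0])
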
    @staticmethod
    def _sort_by_timestamp(group_composite):
        """Get the timestamp used to sort the grouped data."""
        event_timestamp_string = group_composite[1]
        return event_timestamp_string
@staticmethod
def _group_sort_by_timestamp_partition(record_store_df,
group_by_columns_list,
num_of_groups):
"""It does a group by and then sorts all the items within the group by event timestamp."""
# convert the dataframe rdd to normal rdd and add the group by
# column list
record_store_with_group_by_rdd = record_store_df.rdd.\
map(lambda x: RecordStoreWithGroupBy(x, group_by_columns_list))
# prepare the data for repartitionAndSortWithinPartitions function
record_store_rdd_prepared = \
record_store_with_group_by_rdd.\
map(GroupSortbyTimestampPartition._prepare_for_group_by)
# repartition data based on a grouping key and sort the items within
# group by timestamp
# give high number of partitions
# numPartitions > number of groups expected, so that each group gets
# allocated a separate partition
record_store_rdd_partitioned_sorted = \
record_store_rdd_prepared.\
repartitionAndSortWithinPartitions(
numPartitions=num_of_groups,
partitionFunc=GroupSortbyTimestampPartition.
_get_partition_by_group,
keyfunc=GroupSortbyTimestampPartition.
_sort_by_timestamp)
return record_store_rdd_partitioned_sorted
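
    # Toy illustration (assumed data, for exposition only): given pairs
    #
    #   (["^h=a", "t2"], r1), (["^h=b", "t1"], r2), (["^h=a", "t1"], r3)
    #
    # and num_of_groups=2, the "^h=a" rows hash to one partition and the
    # "^h=b" rows to another, and within each partition the rows come out
    # ordered by the keyfunc timestamp (t1 before t2), so a later
    # mapPartitions sees one time-sorted group per partition.
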
    @staticmethod
    def _remove_none_filter(row):
        """Remove any rows which have None as the grouping key.

        [GroupingResults(grouping_key=None, results={...})] rows get
        created when a partition does not get any grouped data assigned
        to it.
        """
if len(row[0].results) > 0 and row[0].grouping_key is not None:
return row
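
    # For example: a partition that received no data makes the UDF above
    # yield [GroupingResults(None, {..., "record_count": 0.0}, ...)];
    # such rows are dropped here because their grouping_key is None.
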
    @staticmethod
    def fetch_group_first_last_quantity(record_store_df,
                                        transform_spec_df,
                                        group_by_columns_list,
                                        num_of_groups):
        """Group record store data and sort by timestamp within each group.

        Returns the first and last timestamps, along with the quantity at
        each, for every group. The group by uses a custom partitioning
        function that gives each group its own partition, together with
        RDD's repartitionAndSortWithinPartitions to do the grouping and
        the sorting within each group.

        This is more scalable than RDD's groupBy, since a group is never
        materialized as an in-memory list; the sort instead relies on the
        RDD's built-in partitioning capability. num_of_groups should be
        larger than the number of expected groups, otherwise the same
        partition can get used for two groups, which will produce
        incorrect results.
        """
# group and order elements in group using repartition
record_store_grouped_data_rdd = \
GroupSortbyTimestampPartition.\
_group_sort_by_timestamp_partition(record_store_df,
group_by_columns_list,
num_of_groups)
# do some operations on all elements in the group
grouping_results_tuple_with_none = \
record_store_grouped_data_rdd.\
mapPartitions(GroupSortbyTimestampPartition.
_get_group_first_last_quantity_udf)
        # filter out all rows which have no data (where grouping key is
        # None) and convert results into a grouping results tuple
grouping_results_tuple1 = grouping_results_tuple_with_none.\
filter(GroupSortbyTimestampPartition._remove_none_filter)
grouping_results_tuple = grouping_results_tuple1.map(lambda x: x[0])
return grouping_results_tuple
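
# A minimal usage sketch (hypothetical DataFrame and column names; in
# monasca-transform the caller supplies these values):
#
#   grouped = GroupSortbyTimestampPartition.fetch_group_first_last_quantity(
#       record_store_df=record_store_df,
#       transform_spec_df=transform_spec_df,
#       group_by_columns_list=["host", "metric_name"],
#       num_of_groups=1024)
#
#   # grouped is an RDD of GroupingResults carrying the first/last
#   # timestamps, first/last quantities and record count for each group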