232 lines
9.3 KiB
Python
232 lines
9.3 KiB
Python
# Copyright 2016 Hewlett Packard Enterprise Development Company LP
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

|
from monasca_transform.transform.grouping import Grouping
from monasca_transform.transform.grouping import GroupingResults
from monasca_transform.transform.grouping import RecordStoreWithGroupBy


|
class GroupSortbyTimestampPartition(Grouping):
    """Group record store data and sort each group by event timestamp.

    Grouping is done with RDD.repartitionAndSortWithinPartitions rather
    than RDD.groupBy so that a group is never materialized as an
    in-memory list: each group's composite key is hashed to a partition
    of its own and the rows within the partition are streamed in
    timestamp order.
    """

    @staticmethod
    def log_debug(logStr):
        """Print a debug message.

        Fix: the original did ``print(str)``, which printed the builtin
        ``str`` type instead of the message that was passed in.
        """
        print(logStr)
        # LOG.debug(logStr)

    @staticmethod
    def _get_group_first_last_quantity_udf(partitionlistiter):
        """User defined function to go through a list of partitions.

        Each partition contains elements for a group. All the elements
        are sorted by timestamp.
        The stats include first row key, first_event_timestamp,
        first event quantity, last_event_timestamp and last event
        quantity.
        """
        first_row = None
        last_row = None

        # number of records seen in this partition (kept as a float to
        # preserve the original record_count value type)
        count = 0.0
        for row in partitionlistiter:

            # set the first row
            if first_row is None:
                first_row = row

            # set the last row
            last_row = row
            count = count + 1

        first_event_timestamp_unix = None
        first_event_timestamp_string = None
        first_event_quantity = None
        first_row_key = None
        if first_row is not None:
            first_event_timestamp_unix = first_row[1].event_timestamp_unix
            first_event_timestamp_string = first_row[1].event_timestamp_string
            first_event_quantity = first_row[1].event_quantity

            # extract the grouping_key from composite grouping_key
            # composite grouping key is a list, where first item is the
            # grouping key and second item is the event_timestamp_string
            first_row_key = first_row[0][0]

        last_event_timestamp_unix = None
        last_event_timestamp_string = None
        last_event_quantity = None
        if last_row is not None:
            last_event_timestamp_unix = last_row[1].event_timestamp_unix
            last_event_timestamp_string = last_row[1].event_timestamp_string
            last_event_quantity = last_row[1].event_quantity

        results_dict = {"firstrecord_timestamp_unix":
                        first_event_timestamp_unix,
                        "firstrecord_timestamp_string":
                        first_event_timestamp_string,
                        "firstrecord_quantity": first_event_quantity,
                        "lastrecord_timestamp_unix":
                        last_event_timestamp_unix,
                        "lastrecord_timestamp_string":
                        last_event_timestamp_string,
                        "lastrecord_quantity": last_event_quantity,
                        "record_count": count}

        first_row_key_dict = Grouping._parse_grouping_key(first_row_key)

        yield [GroupingResults(first_row_key, results_dict,
                               first_row_key_dict)]

    @staticmethod
    def _prepare_for_groupby(record_store_with_groupby_rdd):
        """creates a new rdd where:

        the first element of each row
        contains array of grouping key and event timestamp fields.
        Grouping key and event timestamp fields are used by
        partitioning and sorting function to partition the data
        by grouping key and then sort the elements in a group by the
        timestamp
        """

        # get the record store data and group by columns
        record_store_data = record_store_with_groupby_rdd.record_store_data

        groupby_columns_list = \
            record_store_with_groupby_rdd.groupby_columns_list

        # construct a group by key
        # key1=value1^key2=value2^...
        # NOTE: getattr replaces the original eval()-based attribute
        # lookup -- same resulting value, without executing generated
        # code (safer and faster)
        groupby_key_value = ""
        for gcol in groupby_columns_list:
            groupby_key_value = \
                "^".join((groupby_key_value,
                          "=".join((gcol,
                                    getattr(record_store_data, gcol)))))

        # return a key-value rdd
        # key is a composite key which consists of grouping key and
        # event_timestamp_string
        return [[groupby_key_value,
                 record_store_data.event_timestamp_string],
                record_store_data]

    @staticmethod
    def _get_partition_by_group(group_composite):
        """get a hash of the grouping key,

        which is then used by partitioning
        function to get partition where the groups data should end up in.
        It uses hash % num_partitions to get partition
        """
        # FIXME: find out if hash function in python gives same value on
        # different machines
        # Look at using portable_hash method in spark rdd
        grouping_key = group_composite[0]
        grouping_key_hash = hash(grouping_key)
        # log_debug("groupby_sortby_timestamp_partition: got hash : %s" \
        #    % str(returnhash))
        return grouping_key_hash

    @staticmethod
    def _sortby_timestamp(group_composite):
        """get timestamp which will be used to sort grouped data"""
        event_timestamp_string = group_composite[1]
        return event_timestamp_string

    @staticmethod
    def _group_sortby_timestamp_partition(record_store_df,
                                          groupby_columns_list,
                                          num_of_groups):
        """It does a group by and then sorts items within the group by event timestamp."""
        # convert the dataframe rdd to normal rdd and add the group by
        # column list
        record_store_with_groupby_rdd = record_store_df.rdd.\
            map(lambda x: RecordStoreWithGroupBy(x, groupby_columns_list))

        # prepare the data for repartitionAndSortWithinPartitions function
        record_store_rdd_prepared = \
            record_store_with_groupby_rdd.\
            map(GroupSortbyTimestampPartition._prepare_for_groupby)

        # repartition data based on a grouping key and sort the items within
        # group by timestamp
        # give high number of partitions
        # numPartitions > number of groups expected, so that each group gets
        # allocated a separate partition
        record_store_rdd_partitioned_sorted = \
            record_store_rdd_prepared.\
            repartitionAndSortWithinPartitions(
                numPartitions=num_of_groups,
                partitionFunc=GroupSortbyTimestampPartition.
                _get_partition_by_group,
                keyfunc=GroupSortbyTimestampPartition.
                _sortby_timestamp)

        return record_store_rdd_partitioned_sorted

    @staticmethod
    def _remove_none_filter(row):
        """remove any rows which have None as grouping key

        [GroupingResults(grouping_key="key1", results={})] rows get created
        when partition does not get any grouped data assigned to it.
        Returns the row itself (truthy) to keep it, None (falsy) to drop
        it -- suitable as an RDD.filter predicate.
        """
        if len(row[0].results) > 0 and row[0].grouping_key is not None:
            return row

    @staticmethod
    def fetch_group_first_last_quantity(record_store_df,
                                        transform_spec_df,
                                        groupby_columns_list,
                                        num_of_groups):
        """Function to group record store data

        Sort by timestamp within group
        and get first and last timestamp along with quantity within each group

        To do group by it uses custom partitioning function which creates a new
        partition
        for each group and uses RDD's repartitionAndSortWithinPartitions
        function to do the grouping and sorting within the group.

        This is more scalable than just using RDD's groupby as using this
        technique
        group is not materialized into a list and stored in memory, but rather
        it uses RDD's in built partitioning capability to do the sort

        num_of_groups should be more than expected groups, otherwise the same
        partition can get used for two groups which will cause incorrect
        results.
        """

        # group and order elements in group using repartition
        record_store_grouped_data_rdd = \
            GroupSortbyTimestampPartition.\
            _group_sortby_timestamp_partition(record_store_df,
                                              groupby_columns_list,
                                              num_of_groups)

        # do some operations on all elements in the group
        grouping_results_tuple_with_none = \
            record_store_grouped_data_rdd.\
            mapPartitions(GroupSortbyTimestampPartition.
                          _get_group_first_last_quantity_udf)

        # filter all rows which have no data (where grouping key is None) and
        # convert results into grouping results tuple
        grouping_results_tuple1 = grouping_results_tuple_with_none.\
            filter(GroupSortbyTimestampPartition._remove_none_filter)

        grouping_results_tuple = grouping_results_tuple1.map(lambda x: x[0])

        return grouping_results_tuple
|