Files
deb-python-taskflow/taskflow/examples/simple_map_reduce.py
Joshua Harlow 6520b9c35e Add a basic map/reduce example to show how this can be done
Since we can create tasks, run them in a parallel then direct
there result into a result task we can create a workflow that
can define how this can be accomplished and have the map ops
run in parallel (with the reduction op happening after all the
map ops have finished).

Part of blueprint more-examples

Change-Id: I7c04f5508b35b945c49e5798ece0e298d2e1b979
2014-12-17 19:13:45 +00:00

116 lines
3.7 KiB
Python

# -*- coding: utf-8 -*-
# Copyright (C) 2014 Yahoo! Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import logging
import os
import sys
logging.basicConfig(level=logging.ERROR)
self_dir = os.path.abspath(os.path.dirname(__file__))
top_dir = os.path.abspath(os.path.join(os.path.dirname(__file__),
os.pardir,
os.pardir))
sys.path.insert(0, top_dir)
sys.path.insert(0, self_dir)
# INTRO: this examples shows a simplistic map/reduce implementation where
# a set of mapper(s) will sum a series of input numbers (in parallel) and
# return there individual summed result. A reducer will then use those
# produced values and perform a final summation and this result will then be
# printed (and verified to ensure the calculation was as expected).
import six
from taskflow import engines
from taskflow.patterns import linear_flow
from taskflow.patterns import unordered_flow
from taskflow import task
class SumMapper(task.Task):
def execute(self, inputs):
# Sums some set of provided inputs.
return sum(inputs)
class TotalReducer(task.Task):
def execute(self, *args, **kwargs):
# Reduces all mapped summed outputs into a single value.
total = 0
for (k, v) in six.iteritems(kwargs):
# If any other kwargs was passed in, we don't want to use those
# in the calculation of the total...
if k.startswith('reduction_'):
total += v
return total
def chunk_iter(chunk_size, upperbound):
"""Yields back chunk size pieces from zero to upperbound - 1."""
chunk = []
for i in range(0, upperbound):
chunk.append(i)
if len(chunk) == chunk_size:
yield chunk
chunk = []
# Upper bound of numbers to sum for example purposes...
UPPER_BOUND = 10000
# How many mappers we want to have.
SPLIT = 10
# How big of a chunk we want to give each mapper.
CHUNK_SIZE = UPPER_BOUND // SPLIT
# This will be the workflow we will compose and run.
w = linear_flow.Flow("root")
# The mappers will run in parallel.
store = {}
provided = []
mappers = unordered_flow.Flow('map')
for i, chunk in enumerate(chunk_iter(CHUNK_SIZE, UPPER_BOUND)):
mapper_name = 'mapper_%s' % i
# Give that mapper some information to compute.
store[mapper_name] = chunk
# The reducer uses all of the outputs of the mappers, so it needs
# to be recorded that it needs access to them (under a specific name).
provided.append("reduction_%s" % i)
mappers.add(SumMapper(name=mapper_name,
rebind={'inputs': mapper_name},
provides=provided[-1]))
w.add(mappers)
# The reducer will run last (after all the mappers).
w.add(TotalReducer('reducer', requires=provided))
# Now go!
e = engines.load(w, engine='parallel', store=store, max_workers=4)
print("Running a parallel engine with options: %s" % e.options)
e.run()
# Now get the result the reducer created.
total = e.storage.get('reducer')
print("Calculated result = %s" % total)
# Calculate it manually to verify that it worked...
calc_total = sum(range(0, UPPER_BOUND))
if calc_total != total:
sys.exit(1)