diff --git a/doc/source/horizon/dashboard.user.guide.rst b/doc/source/horizon/dashboard.user.guide.rst index 231016b9..0cb323e3 100644 --- a/doc/source/horizon/dashboard.user.guide.rst +++ b/doc/source/horizon/dashboard.user.guide.rst @@ -240,10 +240,10 @@ will give a walkthrough on how to run those jobs via the Horizon UI. These steps assume that you already have a cluster up and running (in the "Active" state). 1) Sample Pig job - - https://github.com/openstack/sahara/tree/master/etc/edp-examples/pig-job + https://github.com/openstack/sahara/tree/master/etc/edp-examples/edp-pig/trim-spaces - Load the input data file from - https://github.com/openstack/sahara/tree/master/etc/edp-examples/pig-job/data/input + https://github.com/openstack/sahara/tree/master/etc/edp-examples/edp-pig/trim-spaces/data/input into swift - Click on Project/Object Store/Containers and create a container with any @@ -270,11 +270,11 @@ assume that you already have a cluster up and running (in the "Active" state). - Name = example.pig, Storage type = Internal database, click Browse and find example.pig wherever you checked out the sahara project - /etc/edp-examples/pig-job + /etc/edp-examples/edp-pig/trim-spaces - Create another Job Binary: Name = udf.jar, Storage type = Internal database, click Browse and find udf.jar wherever you checked out the - sahara project /etc/edp-examples/pig-job + sahara project /etc/edp-examples/edp-pig/trim-spaces - Create a Job diff --git a/etc/edp-examples/edp-pig/top-todoers/README.rst b/etc/edp-examples/edp-pig/top-todoers/README.rst new file mode 100644 index 00000000..2c1cb3b9 --- /dev/null +++ b/etc/edp-examples/edp-pig/top-todoers/README.rst @@ -0,0 +1,68 @@ +Top TODOers Pig job +=================== + +This script calculates top TODOers in input sources. + +Example of usage +---------------- + +This pig script can process as many input files (sources) as you want. 
+Just put all input files in a directory in HDFS or a container in Swift and +give the path of the HDFS directory (Swift object) as input DataSource for EDP. + +Here are the steps to prepare the input data: + +1. Create the 'input' directory: + +.. sourcecode:: console + + $ mkdir input + +2. Get some sources from GitHub and put them into the 'input' directory: + +.. sourcecode:: console + + $ cd input + $ git clone "https://github.com/openstack/swift.git" + $ git clone "https://github.com/openstack/nova.git" + $ git clone "https://github.com/openstack/glance.git" + $ git clone "https://github.com/openstack/image-api.git" + $ git clone "https://github.com/openstack/neutron.git" + $ git clone "https://github.com/openstack/horizon.git" + $ git clone "https://github.com/openstack/python-novaclient.git" + $ git clone "https://github.com/openstack/python-keystoneclient.git" + $ git clone "https://github.com/openstack/oslo-incubator.git" + $ git clone "https://github.com/openstack/python-neutronclient.git" + $ git clone "https://github.com/openstack/python-glanceclient.git" + $ git clone "https://github.com/openstack/python-swiftclient.git" + $ git clone "https://github.com/openstack/python-cinderclient.git" + $ git clone "https://github.com/openstack/ceilometer.git" + $ git clone "https://github.com/openstack/cinder.git" + $ git clone "https://github.com/openstack/heat.git" + $ git clone "https://github.com/openstack/python-heatclient.git" + $ git clone "https://github.com/openstack/python-ceilometerclient.git" + $ git clone "https://github.com/openstack/oslo.config.git" + $ git clone "https://github.com/openstack/ironic.git" + $ git clone "https://github.com/openstack/python-ironicclient.git" + $ git clone "https://github.com/openstack/operations-guide.git" + $ git clone "https://github.com/openstack/keystone.git" + $ git clone "https://github.com/openstack/oslo.messaging.git" + $ git clone "https://github.com/openstack/oslo.sphinx.git" + $ git clone
"https://github.com/openstack/oslo.version.git" + $ git clone "https://github.com/openstack/sahara.git" + $ git clone "https://github.com/openstack/python-saharaclient.git" + $ git clone "https://github.com/openstack/openstack.git" + $ cd .. + +3. Create a single file containing all sources: + +.. sourcecode:: console + + $ tar -cf input.tar input/* + +.. note:: + + Pig can operate with raw files as well as with compressed data, so in this + step you might want to create a ``*.gz`` file with sources and it should work. + +4. Upload input.tar to Swift or HDFS as the input data source for EDP processing \ No newline at end of file diff --git a/etc/edp-examples/edp-pig/top-todoers/data/expected_output b/etc/edp-examples/edp-pig/top-todoers/data/expected_output new file mode 100644 index 00000000..f2eb5ac3 --- /dev/null +++ b/etc/edp-examples/edp-pig/top-todoers/data/expected_output @@ -0,0 +1,3 @@ +2 https://launchpad.net/~slukjanov +1 https://launchpad.net/~aignatov +1 https://launchpad.net/~mimccune \ No newline at end of file diff --git a/etc/edp-examples/edp-pig/top-todoers/data/input b/etc/edp-examples/edp-pig/top-todoers/data/input new file mode 100644 index 00000000..91f25eea --- /dev/null +++ b/etc/edp-examples/edp-pig/top-todoers/data/input @@ -0,0 +1,18 @@ +# There is some source file with TODO labels inside + + +def sum(a, b): + # TODO(slukjanov): implement how to add numbers + return None + +def sum(a, b): + # TODO(slukjanov): implement how to subtract numbers + return None + +def divide(a, b): + # TODO(aignatov): implement how to divide numbers + return None + +def mul(a, b): + # TODO(mimccune): implement how to multiply numbers + return None diff --git a/etc/edp-examples/edp-pig/top-todoers/example.pig b/etc/edp-examples/edp-pig/top-todoers/example.pig new file mode 100644 index 00000000..e35d05aa --- /dev/null +++ b/etc/edp-examples/edp-pig/top-todoers/example.pig @@ -0,0 +1,17 @@ +input_lines = LOAD '$INPUT' AS (line:chararray); + +-- filter out any lines that are
not with TODO +todo_lines = FILTER input_lines BY line MATCHES '.*TODO\\s*\\(\\w+\\)+.*'; +ids = FOREACH todo_lines GENERATE FLATTEN(REGEX_EXTRACT($0, '(.*)\\((.*)\\)(.*)', 2)); + +-- create a group for each extracted TODO owner id +id_groups = GROUP ids BY $0; + +-- count the entries in each group +atc_count = FOREACH id_groups GENERATE COUNT(ids) AS count, group AS atc; + +-- order the records by count +result = ORDER atc_count BY count DESC; +result = FOREACH result GENERATE count, CONCAT('https://launchpad.net/~', atc); + +STORE result INTO '$OUTPUT' USING PigStorage(); diff --git a/etc/edp-examples/pig-job/README.rst b/etc/edp-examples/edp-pig/trim-spaces/README.rst similarity index 100% rename from etc/edp-examples/pig-job/README.rst rename to etc/edp-examples/edp-pig/trim-spaces/README.rst diff --git a/etc/edp-examples/pig-job/data/expected_output b/etc/edp-examples/edp-pig/trim-spaces/data/expected_output similarity index 100% rename from etc/edp-examples/pig-job/data/expected_output rename to etc/edp-examples/edp-pig/trim-spaces/data/expected_output diff --git a/etc/edp-examples/pig-job/data/input b/etc/edp-examples/edp-pig/trim-spaces/data/input similarity index 100% rename from etc/edp-examples/pig-job/data/input rename to etc/edp-examples/edp-pig/trim-spaces/data/input diff --git a/etc/edp-examples/pig-job/example.pig b/etc/edp-examples/edp-pig/trim-spaces/example.pig similarity index 100% rename from etc/edp-examples/pig-job/example.pig rename to etc/edp-examples/edp-pig/trim-spaces/example.pig diff --git a/etc/edp-examples/pig-job/udf.jar b/etc/edp-examples/edp-pig/trim-spaces/udf.jar similarity index 100% rename from etc/edp-examples/pig-job/udf.jar rename to etc/edp-examples/edp-pig/trim-spaces/udf.jar diff --git a/sahara/tests/integration/tests/edp.py b/sahara/tests/integration/tests/edp.py index 416a3314..3932da58 100644 --- a/sahara/tests/integration/tests/edp.py +++ b/sahara/tests/integration/tests/edp.py @@ -27,7 +27,7 @@ from sahara.utils import edp class
EDPJobInfo(object): - PIG_PATH = 'etc/edp-examples/pig-job/' + PIG_PATH = 'etc/edp-examples/edp-pig/trim-spaces/' JAVA_PATH = 'etc/edp-examples/edp-java/' MAPREDUCE_PATH = 'etc/edp-examples/edp-mapreduce/' SPARK_PATH = 'etc/edp-examples/edp-spark/'