add impl of SparkWordCount example

Change-Id: Ie377e3313f68cde6d580b93be8e15e364fc56718
2016-09-13 15:07:20 +03:00 · 2016-09-13 15:07:20 +03:00 · 881409ed7b
parent e25d9d7c74
commit 881409ed7b
4 changed files with 100 additions and 0 deletions
--- a/sahara_tests/scenario/defaults/edp-examples/edp-spark/spark-wordcount.jar
+++ b/sahara_tests/scenario/defaults/edp-examples/edp-spark/spark-wordcount.jar
--- a/sahara_tests/scenario/defaults/edp-examples/edp-spark/wordcountapp/pom.xml
+++ b/sahara_tests/scenario/defaults/edp-examples/edp-spark/wordcountapp/pom.xml
@ -0,0 +1,61 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+    Licensed under the Apache License, Version 2.0 (the "License"); you may
+    not use this file except in compliance with the License. You may obtain
+    a copy of the License at
+
+         http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+    License for the specific language governing permissions and limitations
+    under the License.
+  -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <groupId>sahara.edp.spark</groupId>
+  <artifactId>sparkwordcount</artifactId>
+  <version>0.0.1-SNAPSHOT</version>
+  <packaging>jar</packaging>
+  <name>"Spark Word Count"</name>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.scala-tools</groupId>
+        <artifactId>maven-scala-plugin</artifactId>
+        <version>2.15.2</version>
+        <executions>
+          <execution>
+            <goals>
+              <goal>compile</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+      <plugin>
+        <artifactId>maven-compiler-plugin</artifactId>
+        <version>3.1</version>
+        <configuration>
+          <source>1.6</source>
+          <target>1.6</target>
+        </configuration>
+      </plugin>
+    </plugins>
+  </build>
+
+  <dependencies>
+    <dependency>
+      <groupId>org.scala-lang</groupId>
+      <artifactId>scala-library</artifactId>
+      <version>2.10.4</version>
+    </dependency>
+    <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-core_2.10 -->
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-core_2.10</artifactId>
+      <version>1.6.0</version>
+    </dependency>
+  </dependencies>
+</project>
--- a/sahara_tests/scenario/defaults/edp-examples/edp-spark/wordcountapp/src/main/scala/sahara/edp/spark/SparkWordCount.scala
+++ b/sahara_tests/scenario/defaults/edp-examples/edp-spark/wordcountapp/src/main/scala/sahara/edp/spark/SparkWordCount.scala
@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package sahara.edp.spark
+
+import org.apache.spark.SparkContext
+import org.apache.spark.SparkContext._
+import org.apache.spark.SparkConf
+
+/**
+ * Spark job that counts how many times each word occurs in a text input
+ * and writes the (word, count) pairs to an output location.
+ *
+ * Takes two positional arguments: the input path and the output path.
+ */
+object SparkWordCount {
+  /**
+   * Job entry point.
+   *
+   * @param args `args(0)` is the input path handed to `textFile`;
+   *             `args(1)` is the output path handed to `saveAsTextFile`.
+   */
+  def main(args: Array[String]): Unit = {
+    // Validate before starting Spark so a missing argument produces a usage
+    // message instead of an ArrayIndexOutOfBoundsException.
+    if (args.length < 2) {
+      System.err.println("Usage: SparkWordCount <input> <output>")
+      sys.exit(1)
+    }
+    val inputPath = args(0)
+    val outputPath = args(1)
+
+    val sc = new SparkContext(new SparkConf().setAppName("Spark Count"))
+    try {
+      // split each line into words on single spaces
+      val tokenized = sc.textFile(inputPath).flatMap(_.split(" "))
+
+      // count the occurrence of each word
+      val wordCounts = tokenized.map((_, 1)).reduceByKey(_ + _)
+
+      // Save the distributed result directly; collecting it to the driver
+      // and re-parallelizing would only add a memory bottleneck.
+      wordCounts.saveAsTextFile(outputPath)
+    } finally {
+      // Always release the context, even when the job fails.
+      sc.stop()
+    }
+  }
+}
--- a/sahara_tests/scenario/defaults/edp.yaml.mako
+++ b/sahara_tests/scenario/defaults/edp.yaml.mako
@ -85,6 +85,7 @@ edp_jobs_flow:
        fs.swift.service.sahara.password: ${os_password}
      args:
        - '{input_datasource}'
+        - '{output_datasource}'
  mapr_pig_job:
    - type: Pig
      input_datasource: