From 3a1da9f2080a98ec9c62b287c0be77344220f680 Mon Sep 17 00:00:00 2001 From: Telles Nobrega Date: Mon, 21 Jan 2019 14:49:23 -0300 Subject: [PATCH] Adding Spark to sahara-image-pack Adding the ability to create spark images using the new image generation. Change-Id: I220a05d782f749d3799fc37aa7d787d2fd8993b4 --- doc/source/user/spark-plugin.rst | 16 +++++ .../spark-on-image-pack-f5609daf38c45b6f.yaml | 4 ++ sahara_plugin_spark/plugins/spark/images.py | 44 +++++++++++++ sahara_plugin_spark/plugins/spark/plugin.py | 17 ++++++ .../resources/images/centos/turn_off_services | 7 +++ .../resources/images/centos/wget_cdh_repo | 43 +++++++++++++ .../spark/resources/images/common/add_jar | 20 ++++++ .../resources/images/common/install_extjs | 30 +++++++++ .../resources/images/common/install_spark | 41 +++++++++++++ .../resources/images/common/manipulate_s3 | 12 ++++ .../plugins/spark/resources/images/image.yaml | 61 +++++++++++++++++++ .../resources/images/ubuntu/config_spark | 12 ++++ .../resources/images/ubuntu/turn_off_services | 7 +++ .../resources/images/ubuntu/wget_cdh_repo | 36 +++++++++++ 14 files changed, 350 insertions(+) create mode 100644 releasenotes/notes/spark-on-image-pack-f5609daf38c45b6f.yaml create mode 100644 sahara_plugin_spark/plugins/spark/images.py create mode 100644 sahara_plugin_spark/plugins/spark/resources/images/centos/turn_off_services create mode 100644 sahara_plugin_spark/plugins/spark/resources/images/centos/wget_cdh_repo create mode 100644 sahara_plugin_spark/plugins/spark/resources/images/common/add_jar create mode 100644 sahara_plugin_spark/plugins/spark/resources/images/common/install_extjs create mode 100644 sahara_plugin_spark/plugins/spark/resources/images/common/install_spark create mode 100644 sahara_plugin_spark/plugins/spark/resources/images/common/manipulate_s3 create mode 100644 sahara_plugin_spark/plugins/spark/resources/images/image.yaml create mode 100644 sahara_plugin_spark/plugins/spark/resources/images/ubuntu/config_spark 
create mode 100644 sahara_plugin_spark/plugins/spark/resources/images/ubuntu/turn_off_services create mode 100644 sahara_plugin_spark/plugins/spark/resources/images/ubuntu/wget_cdh_repo diff --git a/doc/source/user/spark-plugin.rst b/doc/source/user/spark-plugin.rst index 0dd9f53..a2a9581 100644 --- a/doc/source/user/spark-plugin.rst +++ b/doc/source/user/spark-plugin.rst @@ -24,12 +24,28 @@ For cluster provisioning, prepared images should be used. (build parameter) - Notes + * - 2.3 + - Ubuntu 16.04, CentOS 7 + - sahara-image-pack + - 2.3 + - based on CDH 5.11 + use --plugin_version to specify the minor version: 2.3.2 (default), + 2.3.1 or 2.3.0 + * - 2.3 - Ubuntu 16.04 - sahara-image-create - 2.3.0 - based on CDH 5.11 + * - 2.2 + - Ubuntu 16.04, CentOS 7 + - sahara-image-pack + - 2.2 + - based on CDH 5.11 + use --plugin_version to specify the minor version: 2.2.1 (default), + or 2.2.0 + * - 2.2 - Ubuntu 16.04 - sahara-image-create diff --git a/releasenotes/notes/spark-on-image-pack-f5609daf38c45b6f.yaml b/releasenotes/notes/spark-on-image-pack-f5609daf38c45b6f.yaml new file mode 100644 index 0000000..ce5d569 --- /dev/null +++ b/releasenotes/notes/spark-on-image-pack-f5609daf38c45b6f.yaml @@ -0,0 +1,4 @@ +--- +features: + - | + Adding the ability to create spark images using Sahara Image Pack. diff --git a/sahara_plugin_spark/plugins/spark/images.py b/sahara_plugin_spark/plugins/spark/images.py new file mode 100644 index 0000000..ebec619 --- /dev/null +++ b/sahara_plugin_spark/plugins/spark/images.py @@ -0,0 +1,44 @@ +# Copyright (c) 2019 Red Hat, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from sahara.plugins import images +from sahara.plugins import utils as plugin_utils + + +_validator = images.SaharaImageValidator.from_yaml( + 'plugins/spark/resources/images/image.yaml', + resource_roots=['plugins/spark/resources/images'], + package='sahara_plugin_spark') + + +def get_image_arguments(): + return _validator.get_argument_list() + + +def pack_image(remote, test_only=False, image_arguments=None): + _validator.validate(remote, test_only=test_only, + image_arguments=image_arguments) + + +def validate_images(cluster, test_only=False, image_arguments=None): + image_arguments = get_image_arguments() + if not test_only: + instances = plugin_utils.get_instances(cluster) + else: + instances = plugin_utils.get_instances(cluster)[0] + for instance in instances: + with instance.remote() as r: + _validator.validate(r, test_only=test_only, + image_arguments=image_arguments) diff --git a/sahara_plugin_spark/plugins/spark/plugin.py b/sahara_plugin_spark/plugins/spark/plugin.py index 44f87a7..a34b08d 100644 --- a/sahara_plugin_spark/plugins/spark/plugin.py +++ b/sahara_plugin_spark/plugins/spark/plugin.py @@ -30,6 +30,7 @@ from sahara.plugins import utils from sahara_plugin_spark.i18n import _ from sahara_plugin_spark.plugins.spark import config_helper as c_helper from sahara_plugin_spark.plugins.spark import edp_engine +from sahara_plugin_spark.plugins.spark import images from sahara_plugin_spark.plugins.spark import run_scripts as run from sahara_plugin_spark.plugins.spark import scaling as sc from sahara_plugin_spark.plugins.spark import shell_engine 
@@ -569,3 +570,19 @@ class SparkProvider(p.ProvisioningPluginBase): want_to_configure, self.get_configs( cluster.hadoop_version), cluster, scaling) provider.apply_recommended_configs() + + def get_image_arguments(self, hadoop_version): + if hadoop_version in ['1.6.0', '2.1.0']: + return NotImplemented + return images.get_image_arguments() + + def pack_image(self, hadoop_version, remote, + test_only=False, image_arguments=None): + images.pack_image(remote, test_only=test_only, + image_arguments=image_arguments) + + def validate_images(self, cluster, test_only=False, image_arguments=None): + if cluster.hadoop_version not in ['1.6.0', '2.1.0']: + images.validate_images(cluster, + test_only=test_only, + image_arguments=image_arguments) diff --git a/sahara_plugin_spark/plugins/spark/resources/images/centos/turn_off_services b/sahara_plugin_spark/plugins/spark/resources/images/centos/turn_off_services new file mode 100644 index 0000000..9b626a7 --- /dev/null +++ b/sahara_plugin_spark/plugins/spark/resources/images/centos/turn_off_services @@ -0,0 +1,7 @@ +#!/bin/bash +if [ $test_only -eq 0 ]; then + systemctl stop hadoop-hdfs-datanode + systemctl stop hadoop-hdfs-namenode +else + exit 0 +fi diff --git a/sahara_plugin_spark/plugins/spark/resources/images/centos/wget_cdh_repo b/sahara_plugin_spark/plugins/spark/resources/images/centos/wget_cdh_repo new file mode 100644 index 0000000..b2b63f6 --- /dev/null +++ b/sahara_plugin_spark/plugins/spark/resources/images/centos/wget_cdh_repo @@ -0,0 +1,43 @@ +#!/bin/bash + +CDH_VERSION=5.11 +CDH_MINOR_VERSION=5.11.0 + +if [ ! 
-f /etc/yum.repos.d/cloudera-cdh5.repo ]; then + if [ $test_only -eq 0 ]; then + echo '[cloudera-cdh5]' > /etc/yum.repos.d/cloudera-cdh5.repo + echo "name=Cloudera's Distribution for Hadoop, Version 5" >> /etc/yum.repos.d/cloudera-cdh5.repo + echo "baseurl=http://archive.cloudera.com/cdh5/redhat/7/x86_64/cdh/$CDH_MINOR_VERSION/" >> /etc/yum.repos.d/cloudera-cdh5.repo + echo "gpgkey = http://archive.cloudera.com/cdh5/redhat/7/x86_64/cdh/RPM-GPG-KEY-cloudera" >> /etc/yum.repos.d/cloudera-cdh5.repo + echo 'gpgcheck = 1' >> /etc/yum.repos.d/cloudera-cdh5.repo + + echo '[cloudera-manager]' > /etc/yum.repos.d/cloudera-manager.repo + echo 'name=Cloudera Manager' >> /etc/yum.repos.d/cloudera-manager.repo + echo "baseurl=http://archive.cloudera.com/cm5/redhat/7/x86_64/cm/$CDH_MINOR_VERSION/" >> /etc/yum.repos.d/cloudera-manager.repo + echo "gpgkey = http://archive.cloudera.com/cm5/redhat/7/x86_64/cm/RPM-GPG-KEY-cloudera" >> /etc/yum.repos.d/cloudera-manager.repo + echo 'gpgcheck = 1' >> /etc/yum.repos.d/cloudera-manager.repo + + echo '[navigator-keytrustee]' > /etc/yum.repos.d/kms.repo + echo "name=Cloudera's Distribution for navigator-Keytrustee, Version 5" >> /etc/yum.repos.d/kms.repo + + RETURN_CODE="$(curl -s -o /dev/null -w "%{http_code}" http://archive.cloudera.com/navigator-keytrustee5/redhat/7/x86_64/navigator-keytrustee/$CDH_MINOR_VERSION/)" + if [ "$RETURN_CODE" == "404" ]; then + echo "baseurl=http://archive.cloudera.com/navigator-keytrustee5/redhat/7/x86_64/navigator-keytrustee/$CDH_VERSION/" >> /etc/yum.repos.d/kms.repo + else + echo "baseurl=http://archive.cloudera.com/navigator-keytrustee5/redhat/7/x86_64/navigator-keytrustee/$CDH_MINOR_VERSION/" >> /etc/yum.repos.d/kms.repo + fi + + echo "gpgkey = http://archive.cloudera.com/navigator-keytrustee5/redhat/7/x86_64/navigator-keytrustee/RPM-GPG-KEY-cloudera" >> /etc/yum.repos.d/kms.repo + echo 'gpgcheck = 1' >> /etc/yum.repos.d/kms.repo + + echo "[cloudera-kafka]" > /etc/yum.repos.d/cloudera-kafka.repo + echo 
"name=Cloudera's Distribution for kafka, Version 2.2.0" >> /etc/yum.repos.d/cloudera-kafka.repo + echo "baseurl=http://archive.cloudera.com/kafka/redhat/7/x86_64/kafka/2.2.0/" >> /etc/yum.repos.d/cloudera-kafka.repo + echo "gpgkey = http://archive.cloudera.com/kafka/redhat/7/x86_64/kafka/RPM-GPG-KEY-cloudera" >> /etc/yum.repos.d/cloudera-kafka.repo + echo "gpgcheck = 1" >> /etc/yum.repos.d/cloudera-kafka.repo + + yum clean all + else + exit 0 + fi +fi diff --git a/sahara_plugin_spark/plugins/spark/resources/images/common/add_jar b/sahara_plugin_spark/plugins/spark/resources/images/common/add_jar new file mode 100644 index 0000000..6547011 --- /dev/null +++ b/sahara_plugin_spark/plugins/spark/resources/images/common/add_jar @@ -0,0 +1,20 @@ +#!/bin/bash + +hadoop="2.6.0" + +HDFS_LIB_DIR=${hdfs_lib_dir:-"/usr/share/hadoop/lib"} +HADOOP_SWIFT_JAR_NAME="hadoop-openstack.jar" + +if [ $test_only -eq 0 ]; then + mkdir -p $HDFS_LIB_DIR + curl -sS -o $HDFS_LIB_DIR/$HADOOP_SWIFT_JAR_NAME $swift_url + + if [ $? -ne 0 ]; then + echo -e "Could not download Swift Hadoop FS implementation.\nAborting" + exit 1 + fi + + chmod 0644 $HDFS_LIB_DIR/$HADOOP_SWIFT_JAR_NAME +else + exit 0 +fi diff --git a/sahara_plugin_spark/plugins/spark/resources/images/common/install_extjs b/sahara_plugin_spark/plugins/spark/resources/images/common/install_extjs new file mode 100644 index 0000000..1a2065c --- /dev/null +++ b/sahara_plugin_spark/plugins/spark/resources/images/common/install_extjs @@ -0,0 +1,30 @@ +#!/bin/bash + +EXTJS_DESTINATION_DIR="/var/lib/oozie" +EXTJS_DOWNLOAD_URL="https://tarballs.openstack.org/sahara-extra/dist/common-artifacts/ext-2.2.zip" + +extjs_basepath=$(basename ${EXTJS_DOWNLOAD_URL}) +extjs_archive=/tmp/${extjs_basepath} +extjs_folder="${extjs_basepath%.*}" + +setup_extjs() { + curl -sS -o $extjs_archive $EXTJS_DOWNLOAD_URL + mkdir -p $EXTJS_DESTINATION_DIR +} + +if [ -z "${EXTJS_NO_UNPACK:-}" ]; then + if [ ! 
-d "${EXTJS_DESTINATION_DIR}/${extjs_folder}" ]; then + setup_extjs + unzip -o -d "$EXTJS_DESTINATION_DIR" $extjs_archive + rm -f $extjs_archive + else + exit 0 + fi +else + if [ ! -f "${EXTJS_DESTINATION_DIR}/${extjs_basepath}" ]; then + setup_extjs + mv $extjs_archive $EXTJS_DESTINATION_DIR + else + exit 0 + fi +fi diff --git a/sahara_plugin_spark/plugins/spark/resources/images/common/install_spark b/sahara_plugin_spark/plugins/spark/resources/images/common/install_spark new file mode 100644 index 0000000..7596f05 --- /dev/null +++ b/sahara_plugin_spark/plugins/spark/resources/images/common/install_spark @@ -0,0 +1,41 @@ +#!/bin/bash + +tmp_dir=/tmp/spark +CDH_VERSION=5.11 +mkdir -p $tmp_dir + +if [ ! -d /opt/spark ]; then + if [ $test_only -eq 0 ]; then + # The user is not providing his own Spark distribution package + if [ -z "${SPARK_DOWNLOAD_URL:-}" ]; then + # Check hadoop version + # INFO on hadoop versions: http://spark.apache.org/docs/latest/hadoop-third-party-distributions.html + # Now the below is just a sanity check + if [ -z "${SPARK_HADOOP_DL:-}" ]; then + SPARK_HADOOP_DL=hadoop2.7 + fi + + SPARK_DOWNLOAD_URL="http://archive.apache.org/dist/spark/spark-$plugin_version/spark-$plugin_version-bin-$SPARK_HADOOP_DL.tgz" + fi + + echo "Downloading SPARK" + spark_file=$(basename "$SPARK_DOWNLOAD_URL") + wget -O $tmp_dir/$spark_file $SPARK_DOWNLOAD_URL + echo "$SPARK_DOWNLOAD_URL" > $tmp_dir/spark_url.txt + + echo "Extracting SPARK" + extract_folder=$(tar tzf $tmp_dir/$spark_file | sed -e 's@/.*@@' | uniq) + echo "Decompressing Spark..." 
+ tar xzf $tmp_dir/$spark_file + rm $tmp_dir/$spark_file + + echo "Moving SPARK to /opt/" + # Placing spark in /opt/spark + mv $extract_folder /opt/spark/ + mv $tmp_dir/spark_url.txt /opt/spark/ + + rm -Rf $tmp_dir + else + exit 1 + fi +fi diff --git a/sahara_plugin_spark/plugins/spark/resources/images/common/manipulate_s3 b/sahara_plugin_spark/plugins/spark/resources/images/common/manipulate_s3 new file mode 100644 index 0000000..47b790d --- /dev/null +++ b/sahara_plugin_spark/plugins/spark/resources/images/common/manipulate_s3 @@ -0,0 +1,12 @@ +#!/bin/bash + +SPARK_JARS_DIR_PATH="/opt/spark/jars" +HADOOP_TOOLS_DIR_PATH="/opt/hadoop/share/hadoop/tools/lib" +HADOOP_COMMON_DIR_PATH="/opt/hadoop/share/hadoop/common/lib" + + +# The hadoop-aws and aws-java-sdk libraries are missing here, but we +# cannot copy them from the Hadoop folder on-disk due to +# version/patching issues +curl -sS https://tarballs.openstack.org/sahara-extra/dist/common-artifacts/hadoop-aws-2.7.3.jar -o $SPARK_JARS_DIR_PATH/hadoop-aws.jar +curl -sS http://central.maven.org/maven2/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar -o $SPARK_JARS_DIR_PATH/aws-java-sdk.jar diff --git a/sahara_plugin_spark/plugins/spark/resources/images/image.yaml b/sahara_plugin_spark/plugins/spark/resources/images/image.yaml new file mode 100644 index 0000000..76ec382 --- /dev/null +++ b/sahara_plugin_spark/plugins/spark/resources/images/image.yaml @@ -0,0 +1,61 @@ +arguments: + plugin_version: + description: The version of Spark to install. Defaults to 2.3.2 + default: 2.3.2 + choices: + - 2.3.2 + - 2.3.1 + - 2.3.0 + - 2.2.1 + - 2.2.0 + java_distro: + default: openjdk + description: The distribution of Java to install. Defaults to openjdk. + choices: + - openjdk + - oracle-java + hdfs_lib_dir: + default: /usr/lib/hadoop-mapreduce + description: The path to HDFS lib. Defaults to /usr/lib/hadoop-mapreduce. 
+ required: False + swift_url: + default: https://tarballs.openstack.org/sahara-extra/dist/hadoop-openstack/master/hadoop-openstack-2.6.0.jar + description: Location of the swift jar file. + required: False + +validators: + - os_case: + - redhat: + - package: wget + - script: centos/wget_cdh_repo + - ubuntu: + - script: ubuntu/wget_cdh_repo + - argument_case: + argument_name: java_distro + cases: + openjdk: + - os_case: + - redhat: + - package: java-1.8.0-openjdk-devel + - ubuntu: + - package: openjdk-8-jdk + - script: + common/install_spark: + env_vars: [plugin_version, cdh_version] + - os_case: + - ubuntu: + - script: ubuntu/config_spark + - package: ntp + - package: + - hadoop-hdfs-namenode + - hadoop-hdfs-datanode + - script: common/install_extjs + - os_case: + - redhat: + - script: centos/turn_off_services + - ubuntu: + - script: ubuntu/turn_off_services + - script: common/manipulate_s3 + - script: + common/add_jar: + env_vars: [hdfs_lib_dir, swift_url] diff --git a/sahara_plugin_spark/plugins/spark/resources/images/ubuntu/config_spark b/sahara_plugin_spark/plugins/spark/resources/images/ubuntu/config_spark new file mode 100644 index 0000000..5f985b7 --- /dev/null +++ b/sahara_plugin_spark/plugins/spark/resources/images/ubuntu/config_spark @@ -0,0 +1,12 @@ +#!/bin/bash + +firstboot_script_name="/opt/spark/firstboot.sh" +sed -i -e "s,^exit 0$,[ -f $firstboot_script_name ] \&\& sh $firstboot_script_name; exit 0," /etc/rc.local +user_and_group_names="ubuntu:ubuntu" + +cat >> $firstboot_script_name <> /etc/apt/sources.list + + # Cloudera repositories + echo "deb [arch=amd64] http://archive.cloudera.com/cdh5/ubuntu/xenial/amd64/cdh xenial-cdh$CDH_VERSION contrib" > /etc/apt/sources.list.d/cdh5.list + echo "deb-src http://archive.cloudera.com/cdh5/ubuntu/xenial/amd64/cdh xenial-cdh$CDH_VERSION contrib" >> /etc/apt/sources.list.d/cdh5.list + + wget -qO - http://archive.cloudera.com/cdh5/ubuntu/xenial/amd64/cdh/archive.key | apt-key add - + + echo "deb [arch=amd64] 
http://archive.cloudera.com/cm5/ubuntu/xenial/amd64/cm xenial-cm$CDH_VERSION contrib" > /etc/apt/sources.list.d/cm5.list + echo "deb-src http://archive.cloudera.com/cm5/ubuntu/xenial/amd64/cm xenial-cm$CDH_VERSION contrib" >> /etc/apt/sources.list.d/cm5.list + + wget -qO - http://archive.cloudera.com/cm5/ubuntu/xenial/amd64/cm/archive.key | apt-key add - + + wget -O /etc/apt/sources.list.d/kms.list http://archive.cloudera.com/navigator-keytrustee5/ubuntu/xenial/amd64/navigator-keytrustee/cloudera.list + wget -qO - http://archive.cloudera.com/navigator-keytrustee5/ubuntu/xenial/amd64/navigator-keytrustee/archive.key | apt-key add - + + # add Kafka repository + echo 'deb http://archive.cloudera.com/kafka/ubuntu/xenial/amd64/kafka/ xenial-kafka2.2.0 contrib' >> /etc/apt/sources.list + wget -qO - https://archive.cloudera.com/kafka/ubuntu/xenial/amd64/kafka/archive.key | apt-key add - + + #change repository priority + echo 'Package: zookeeper\nPin: origin "archive.cloudera.com"\nPin-Priority: 1001' > /etc/apt/preferences.d/cloudera-pin + + apt-get update + else + exit 0 + fi +fi