Adding Spark to sahara-image-pack

Adding the ability to create Spark images using the new image
generation mechanism (sahara-image-pack).

Change-Id: I220a05d782f749d3799fc37aa7d787d2fd8993b4
Telles Nobrega 2019-01-21 14:49:23 -03:00
parent 3ac6ec00c9
commit 3a1da9f208
14 changed files with 350 additions and 0 deletions

View File

@@ -24,12 +24,28 @@ For cluster provisioning, prepared images should be used.
       (build parameter)
     - Notes
   * - 2.3
     - Ubuntu 16.04, CentOS 7
     - sahara-image-pack
     - 2.3
     - based on CDH 5.11,
       use --plugin_version to specify the minor version: 2.3.2 (default),
       2.3.1 or 2.3.0
   * - 2.3
     - Ubuntu 16.04
     - sahara-image-create
     - 2.3.0
     - based on CDH 5.11
   * - 2.2
     - Ubuntu 16.04, CentOS 7
     - sahara-image-pack
     - 2.2
     - based on CDH 5.11,
       use --plugin_version to specify the minor version: 2.2.1 (default),
       or 2.2.0
   * - 2.2
     - Ubuntu 16.04
     - sahara-image-create

View File

@@ -0,0 +1,4 @@
---
features:
  - |
    Added the ability to create Spark images using Sahara Image Pack.
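
For reference, a hedged sketch of how the new capability is exercised from the command line; the image path is a placeholder, and apart from --plugin_version (documented in the table above) the flags follow the usual sahara-image-pack invocation:

    # Pack an existing Ubuntu 16.04 or CentOS 7 cloud image with Spark.
    # "2.3" is the build parameter; --plugin_version picks the minor release.
    sahara-image-pack --image ./ubuntu-16.04-server.qcow2 \
        spark 2.3 --plugin_version 2.3.2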

View File

@@ -0,0 +1,44 @@
# Copyright (c) 2019 Red Hat, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from sahara.plugins import images
from sahara.plugins import utils as plugin_utils


_validator = images.SaharaImageValidator.from_yaml(
    'plugins/spark/resources/images/image.yaml',
    resource_roots=['plugins/spark/resources/images'],
    package='sahara_plugin_spark')


def get_image_arguments():
    return _validator.get_argument_list()


def pack_image(remote, test_only=False, image_arguments=None):
    _validator.validate(remote, test_only=test_only,
                        image_arguments=image_arguments)


def validate_images(cluster, test_only=False, image_arguments=None):
    image_arguments = get_image_arguments()
    if not test_only:
        instances = plugin_utils.get_instances(cluster)
    else:
        # checking one instance is enough in test-only mode; [:1] keeps
        # the result iterable, unlike [0]
        instances = plugin_utils.get_instances(cluster)[:1]
    for instance in instances:
        with instance.remote() as r:
            _validator.validate(r, test_only=test_only,
                                image_arguments=image_arguments)

View File

@@ -30,6 +30,7 @@ from sahara.plugins import utils
from sahara_plugin_spark.i18n import _
from sahara_plugin_spark.plugins.spark import config_helper as c_helper
from sahara_plugin_spark.plugins.spark import edp_engine
from sahara_plugin_spark.plugins.spark import images
from sahara_plugin_spark.plugins.spark import run_scripts as run
from sahara_plugin_spark.plugins.spark import scaling as sc
from sahara_plugin_spark.plugins.spark import shell_engine
@@ -569,3 +570,19 @@ class SparkProvider(p.ProvisioningPluginBase):
            want_to_configure, self.get_configs(
                cluster.hadoop_version), cluster, scaling)
        provider.apply_recommended_configs()

    def get_image_arguments(self, hadoop_version):
        if hadoop_version in ['1.6.0', '2.1.0']:
            return NotImplemented
        return images.get_image_arguments()

    def pack_image(self, hadoop_version, remote,
                   test_only=False, image_arguments=None):
        images.pack_image(remote, test_only=test_only,
                          image_arguments=image_arguments)

    def validate_images(self, cluster, test_only=False, image_arguments=None):
        if cluster.hadoop_version not in ['1.6.0', '2.1.0']:
            images.validate_images(cluster,
                                   test_only=test_only,
                                   image_arguments=image_arguments)

View File

@@ -0,0 +1,7 @@
#!/bin/bash
if [ $test_only -eq 0 ]; then
    systemctl stop hadoop-hdfs-datanode
    systemctl stop hadoop-hdfs-namenode
else
    exit 0
fi
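
All of these generation scripts share one contract: the framework exports test_only before running them (0 when actually packing an image, non-zero when merely validating one), so a script mutates the system only in the packing case. A minimal sketch of driving one by hand under that assumed contract:

    # Hypothetical standalone run; sahara-image-pack normally exports
    # test_only itself.
    export test_only=0    # 0 = apply changes; 1 = verify only, change nothing
    bash centos/turn_off_services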

View File

@@ -0,0 +1,43 @@
#!/bin/bash

CDH_VERSION=5.11
CDH_MINOR_VERSION=5.11.0

if [ ! -f /etc/yum.repos.d/cloudera-cdh5.repo ]; then
    if [ $test_only -eq 0 ]; then
        echo '[cloudera-cdh5]' > /etc/yum.repos.d/cloudera-cdh5.repo
        echo "name=Cloudera's Distribution for Hadoop, Version 5" >> /etc/yum.repos.d/cloudera-cdh5.repo
        echo "baseurl=http://archive.cloudera.com/cdh5/redhat/7/x86_64/cdh/$CDH_MINOR_VERSION/" >> /etc/yum.repos.d/cloudera-cdh5.repo
        echo "gpgkey = http://archive.cloudera.com/cdh5/redhat/7/x86_64/cdh/RPM-GPG-KEY-cloudera" >> /etc/yum.repos.d/cloudera-cdh5.repo
        echo 'gpgcheck = 1' >> /etc/yum.repos.d/cloudera-cdh5.repo

        echo '[cloudera-manager]' > /etc/yum.repos.d/cloudera-manager.repo
        echo 'name=Cloudera Manager' >> /etc/yum.repos.d/cloudera-manager.repo
        echo "baseurl=http://archive.cloudera.com/cm5/redhat/7/x86_64/cm/$CDH_MINOR_VERSION/" >> /etc/yum.repos.d/cloudera-manager.repo
        echo "gpgkey = http://archive.cloudera.com/cm5/redhat/7/x86_64/cm/RPM-GPG-KEY-cloudera" >> /etc/yum.repos.d/cloudera-manager.repo
        echo 'gpgcheck = 1' >> /etc/yum.repos.d/cloudera-manager.repo

        echo '[navigator-keytrustee]' > /etc/yum.repos.d/kms.repo
        echo "name=Cloudera's Distribution for navigator-Keytrustee, Version 5" >> /etc/yum.repos.d/kms.repo

        # the minor-version repo is not published for every release; fall
        # back to the major-version repo when it is missing (HTTP 404)
        RETURN_CODE="$(curl -s -o /dev/null -w "%{http_code}" http://archive.cloudera.com/navigator-keytrustee5/redhat/7/x86_64/navigator-keytrustee/$CDH_MINOR_VERSION/)"
        if [ "$RETURN_CODE" == "404" ]; then
            echo "baseurl=http://archive.cloudera.com/navigator-keytrustee5/redhat/7/x86_64/navigator-keytrustee/$CDH_VERSION/" >> /etc/yum.repos.d/kms.repo
        else
            echo "baseurl=http://archive.cloudera.com/navigator-keytrustee5/redhat/7/x86_64/navigator-keytrustee/$CDH_MINOR_VERSION/" >> /etc/yum.repos.d/kms.repo
        fi
        echo "gpgkey = http://archive.cloudera.com/navigator-keytrustee5/redhat/7/x86_64/navigator-keytrustee/RPM-GPG-KEY-cloudera" >> /etc/yum.repos.d/kms.repo
        echo 'gpgcheck = 1' >> /etc/yum.repos.d/kms.repo

        echo "[cloudera-kafka]" > /etc/yum.repos.d/cloudera-kafka.repo
        echo "name=Cloudera's Distribution for kafka, Version 2.2.0" >> /etc/yum.repos.d/cloudera-kafka.repo
        echo "baseurl=http://archive.cloudera.com/kafka/redhat/7/x86_64/kafka/2.2.0/" >> /etc/yum.repos.d/cloudera-kafka.repo
        echo "gpgkey = http://archive.cloudera.com/kafka/redhat/7/x86_64/kafka/RPM-GPG-KEY-cloudera" >> /etc/yum.repos.d/cloudera-kafka.repo
        echo "gpgcheck = 1" >> /etc/yum.repos.d/cloudera-kafka.repo

        yum clean all
    else
        exit 0
    fi
fi

View File

@@ -0,0 +1,20 @@
#!/bin/bash

hadoop="2.6.0"

HDFS_LIB_DIR=${hdfs_lib_dir:-"/usr/share/hadoop/lib"}
HADOOP_SWIFT_JAR_NAME="hadoop-openstack.jar"

if [ $test_only -eq 0 ]; then
    mkdir -p $HDFS_LIB_DIR
    curl -sS -o $HDFS_LIB_DIR/$HADOOP_SWIFT_JAR_NAME $swift_url
    if [ $? -ne 0 ]; then
        echo -e "Could not download Swift Hadoop FS implementation.\nAborting"
        exit 1
    fi
    chmod 0644 $HDFS_LIB_DIR/$HADOOP_SWIFT_JAR_NAME
else
    exit 0
fi
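
hdfs_lib_dir and swift_url are not hardcoded here: they arrive as environment variables, wired up by the env_vars list in image.yaml further down. Assuming image arguments surface as CLI flags the same way --plugin_version does in the docs, overriding them would look something like:

    sahara-image-pack --image ./ubuntu-16.04-server.qcow2 spark 2.3 \
        --hdfs_lib_dir /usr/lib/hadoop-mapreduce \
        --swift_url https://tarballs.openstack.org/sahara-extra/dist/hadoop-openstack/master/hadoop-openstack-2.6.0.jar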

View File

@@ -0,0 +1,30 @@
#!/bin/bash

EXTJS_DESTINATION_DIR="/var/lib/oozie"
EXTJS_DOWNLOAD_URL="https://tarballs.openstack.org/sahara-extra/dist/common-artifacts/ext-2.2.zip"

extjs_basepath=$(basename ${EXTJS_DOWNLOAD_URL})
extjs_archive=/tmp/${extjs_basepath}
extjs_folder="${extjs_basepath%.*}"

setup_extjs() {
    curl -sS -o $extjs_archive $EXTJS_DOWNLOAD_URL
    mkdir -p $EXTJS_DESTINATION_DIR
}

if [ -z "${EXTJS_NO_UNPACK:-}" ]; then
    if [ ! -d "${EXTJS_DESTINATION_DIR}/${extjs_folder}" ]; then
        setup_extjs
        unzip -o -d "$EXTJS_DESTINATION_DIR" $extjs_archive
        rm -f $extjs_archive
    else
        exit 0
    fi
else
    if [ ! -f "${EXTJS_DESTINATION_DIR}/${extjs_basepath}" ]; then
        setup_extjs
        mv $extjs_archive $EXTJS_DESTINATION_DIR
    else
        exit 0
    fi
fi
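
The EXTJS_NO_UNPACK branch keeps the archive zipped for consumers that prefer to unpack it themselves; toggling it is just an environment variable (a sketch — the framework does not export it by default, as the ${EXTJS_NO_UNPACK:-} guard suggests):

    # Leave ext-2.2.zip in /var/lib/oozie instead of unpacking it there.
    export EXTJS_NO_UNPACK=1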

View File

@@ -0,0 +1,41 @@
#!/bin/bash

tmp_dir=/tmp/spark
CDH_VERSION=5.11

mkdir -p $tmp_dir

if [ ! -d /opt/spark ]; then
    if [ $test_only -eq 0 ]; then
        # The user is not providing his own Spark distribution package
        if [ -z "${SPARK_DOWNLOAD_URL:-}" ]; then
            # Check hadoop version
            # INFO on hadoop versions: http://spark.apache.org/docs/latest/hadoop-third-party-distributions.html
            # Now the below is just a sanity check
            if [ -z "${SPARK_HADOOP_DL:-}" ]; then
                SPARK_HADOOP_DL=hadoop2.7
            fi
            SPARK_DOWNLOAD_URL="http://archive.apache.org/dist/spark/spark-$plugin_version/spark-$plugin_version-bin-$SPARK_HADOOP_DL.tgz"
        fi

        echo "Downloading SPARK"
        spark_file=$(basename "$SPARK_DOWNLOAD_URL")
        wget -O $tmp_dir/$spark_file $SPARK_DOWNLOAD_URL
        echo "$SPARK_DOWNLOAD_URL" > $tmp_dir/spark_url.txt

        echo "Extracting SPARK"
        extract_folder=$(tar tzf $tmp_dir/$spark_file | sed -e 's@/.*@@' | uniq)
        echo "Decompressing Spark..."
        tar xzf $tmp_dir/$spark_file
        rm $tmp_dir/$spark_file

        echo "Moving SPARK to /opt/"
        # Placing spark in /opt/spark
        mv $extract_folder /opt/spark/
        mv $tmp_dir/spark_url.txt /opt/spark/
        rm -Rf $tmp_dir
    else
        exit 1
    fi
fi
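
Both download knobs above are plain environment variables, so a locally mirrored or custom-built Spark tarball can be substituted without touching the script; the values below are examples only:

    export SPARK_HADOOP_DL=hadoop2.7
    export SPARK_DOWNLOAD_URL=https://archive.apache.org/dist/spark/spark-2.3.2/spark-2.3.2-bin-hadoop2.7.tgz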

View File

@@ -0,0 +1,12 @@
#!/bin/bash

SPARK_JARS_DIR_PATH="/opt/spark/jars"
HADOOP_TOOLS_DIR_PATH="/opt/hadoop/share/hadoop/tools/lib"
HADOOP_COMMON_DIR_PATH="/opt/hadoop/share/hadoop/common/lib"

# The hadoop-aws and aws-java-sdk libraries are missing here, but we
# cannot copy them from the Hadoop folder on-disk due to
# version/patching issues
curl -sS https://tarballs.openstack.org/sahara-extra/dist/common-artifacts/hadoop-aws-2.7.3.jar -o $SPARK_JARS_DIR_PATH/hadoop-aws.jar
curl -sS http://central.maven.org/maven2/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar -o $SPARK_JARS_DIR_PATH/aws-java-sdk.jar
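
With those two jars on Spark's classpath, the s3a:// filesystem becomes usable from jobs on the packed image; endpoint, credentials and paths in this sketch are placeholders:

    /opt/spark/bin/spark-submit \
        --conf spark.hadoop.fs.s3a.endpoint=http://object-store.example.com:8080 \
        --conf spark.hadoop.fs.s3a.access.key=ACCESS_KEY \
        --conf spark.hadoop.fs.s3a.secret.key=SECRET_KEY \
        wordcount.py s3a://demo-bucket/input.txt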

View File

@@ -0,0 +1,61 @@
arguments:
  plugin_version:
    description: The version of Spark to install. Defaults to 2.3.2.
    default: 2.3.2
    choices:
      - 2.3.2
      - 2.3.1
      - 2.3.0
      - 2.2.1
      - 2.2.0
  java_distro:
    default: openjdk
    description: The distribution of Java to install. Defaults to openjdk.
    choices:
      - openjdk
      - oracle-java
  hdfs_lib_dir:
    default: /usr/lib/hadoop-mapreduce
    description: The path to HDFS lib. Defaults to /usr/lib/hadoop-mapreduce.
    required: False
  swift_url:
    default: https://tarballs.openstack.org/sahara-extra/dist/hadoop-openstack/master/hadoop-openstack-2.6.0.jar
    description: Location of the swift jar file.
    required: False

validators:
  - os_case:
      - redhat:
          - package: wget
          - script: centos/wget_cdh_repo
      - ubuntu:
          - script: ubuntu/wget_cdh_repo
  - argument_case:
      argument_name: java_distro
      cases:
        openjdk:
          - os_case:
              - redhat:
                  - package: java-1.8.0-openjdk-devel
              - ubuntu:
                  - package: openjdk-8-jdk
  - script:
      common/install_spark:
        env_vars: [plugin_version, cdh_version]
  - os_case:
      - ubuntu:
          - script: ubuntu/config_spark
  - package: ntp
  - package:
      - hadoop-hdfs-namenode
      - hadoop-hdfs-datanode
  - script: common/install_extjs
  - os_case:
      - redhat:
          - script: centos/turn_off_services
      - ubuntu:
          - script: ubuntu/turn_off_services
  - script: common/manipulate_s3
  - script:
      common/add_jar:
        env_vars: [hdfs_lib_dir, swift_url]
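
The env_vars lists are the glue between this spec and the shell scripts: each named argument is exported into the script's environment under the same name, which is why common/install_spark and common/add_jar read $plugin_version, $hdfs_lib_dir and friends without defining them. A one-line illustration of what such a script can assume:

    # Inside a script declared with env_vars: [plugin_version, cdh_version]:
    echo "packing Spark ${plugin_version} against CDH ${cdh_version}"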

View File

@@ -0,0 +1,12 @@
#!/bin/bash

firstboot_script_name="/opt/spark/firstboot.sh"
sed -i -e "s,^exit 0$,[ -f $firstboot_script_name ] \&\& sh $firstboot_script_name; exit 0," /etc/rc.local

user_and_group_names="ubuntu:ubuntu"

cat >> $firstboot_script_name <<EOF
#!/bin/sh
chown -R $user_and_group_names /opt/spark
chown -R $user_and_group_names /etc/hadoop
rm $firstboot_script_name
EOF
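
The sed above rewrites the trailing exit 0 of /etc/rc.local so the generated script runs once at first boot and then deletes itself; after packing, the end of /etc/rc.local reads:

    [ -f /opt/spark/firstboot.sh ] && sh /opt/spark/firstboot.sh; exit 0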

View File

@@ -0,0 +1,7 @@
#!/bin/bash
if [ $test_only -eq 0 ]; then
    update-rc.d -f hadoop-hdfs-datanode remove
    update-rc.d -f hadoop-hdfs-namenode remove
else
    exit 0
fi

View File

@@ -0,0 +1,36 @@
#!/bin/bash

CDH_VERSION=5.11

if [ ! -f /etc/apt/sources.list.d/cdh5.list ]; then
    if [ $test_only -eq 0 ]; then
        # Add repository with postgresql package (it's a dependency of the
        # Cloudera packages); the base image doesn't contain this repo
        echo 'deb http://nova.clouds.archive.ubuntu.com/ubuntu/ xenial universe multiverse main' >> /etc/apt/sources.list

        # Cloudera repositories
        echo "deb [arch=amd64] http://archive.cloudera.com/cdh5/ubuntu/xenial/amd64/cdh xenial-cdh$CDH_VERSION contrib" > /etc/apt/sources.list.d/cdh5.list
        echo "deb-src http://archive.cloudera.com/cdh5/ubuntu/xenial/amd64/cdh xenial-cdh$CDH_VERSION contrib" >> /etc/apt/sources.list.d/cdh5.list
        wget -qO - http://archive.cloudera.com/cdh5/ubuntu/xenial/amd64/cdh/archive.key | apt-key add -

        echo "deb [arch=amd64] http://archive.cloudera.com/cm5/ubuntu/xenial/amd64/cm xenial-cm$CDH_VERSION contrib" > /etc/apt/sources.list.d/cm5.list
        echo "deb-src http://archive.cloudera.com/cm5/ubuntu/xenial/amd64/cm xenial-cm$CDH_VERSION contrib" >> /etc/apt/sources.list.d/cm5.list
        wget -qO - http://archive.cloudera.com/cm5/ubuntu/xenial/amd64/cm/archive.key | apt-key add -

        wget -O /etc/apt/sources.list.d/kms.list http://archive.cloudera.com/navigator-keytrustee5/ubuntu/xenial/amd64/navigator-keytrustee/cloudera.list
        wget -qO - http://archive.cloudera.com/navigator-keytrustee5/ubuntu/xenial/amd64/navigator-keytrustee/archive.key | apt-key add -

        # Add Kafka repository
        echo 'deb http://archive.cloudera.com/kafka/ubuntu/xenial/amd64/kafka/ xenial-kafka2.2.0 contrib' >> /etc/apt/sources.list
        wget -qO - https://archive.cloudera.com/kafka/ubuntu/xenial/amd64/kafka/archive.key | apt-key add -

        # Change repository priority; printf is used because plain echo
        # would write the \n sequences literally instead of line breaks
        printf 'Package: zookeeper\nPin: origin "archive.cloudera.com"\nPin-Priority: 1001\n' > /etc/apt/preferences.d/cloudera-pin

        apt-get update
    else
        exit 0
    fi
fi