Totally rewrite s3_hadoop
Remedies patching problems, version conflicts, classpath issues, etc. ALSO: Switch the Hadoop libraries used on the Spark standalone plugin to Hadoop 2.7.3. The version was previously 2.6.5, to match Cloudera's so-called "Hadoop 2.6.0", but in fact this concordance is not at all necessary... Change-Id: Iafafb64fd60a1ae585375a68173c84fbb82c7e1f
This commit is contained in:
parent
83224a6c5e
commit
7910521a7e
@ -622,6 +622,11 @@ if [ -z "$PLUGIN" -o "$PLUGIN" = "spark" ]; then
|
||||
export DIB_RELEASE=${DIB_RELEASE:-trusty}
|
||||
export DIB_CDH_VERSION="5.5"
|
||||
fi
|
||||
if [ "$DIB_SPARK_VERSION" = "1.6.0" ]; then
|
||||
export SPARK_HADOOP_DL=hadoop2.6
|
||||
else
|
||||
export SPARK_HADOOP_DL=hadoop2.7
|
||||
fi
|
||||
# Tell the cloudera element to install only hdfs
|
||||
export DIB_CDH_HDFS_ONLY=1
|
||||
|
||||
@ -630,6 +635,7 @@ if [ -z "$PLUGIN" -o "$PLUGIN" = "spark" ]; then
|
||||
|
||||
# Creating Ubuntu cloud image
|
||||
image_create ubuntu $ubuntu_image_name $ubuntu_elements_sequence
|
||||
unset SPARK_HADOOP_DL
|
||||
unset DIB_CLOUD_INIT_DATASOURCES
|
||||
unset DIB_HDFS_LIB_DIR
|
||||
unset DIB_CDH_HDFS_ONLY
|
||||
|
@ -1,60 +0,0 @@
|
||||
#!/bin/bash
# s3_hadoop element: expose the hadoop-aws (S3A) libraries to Hadoop and
# Spark inside the image being built.
#
# Required env: plugin_type  - sahara plugin the image targets
#                              ("vanilla", "spark"; others are rejected).
if [ "${DIB_DEBUG_TRACE:-0}" -gt 0 ]; then
    set -x
fi
set -eu
set -o pipefail

# Resolve per-plugin library locations.
case "$plugin_type" in
    "vanilla" )
        HADOOP_TOOLS_DIR_PATH="/opt/hadoop/share/hadoop/tools/lib"
        HADOOP_ENV_SH_PATH="/opt/hadoop/etc/hadoop/hadoop-env.sh"
        SPARK_JARS_DIR_PATH="/opt/spark/jars"
    ;;
    "spark" )
        HADOOP_TOOLS_DIR_PATH="/usr/lib/hadoop/client"
        SPARK_JARS_DIR_PATH="/opt/spark/jars"
    ;;
    "cloudera" )
        echo -n "The s3_hadoop element is not supported on CDH," >&2
        echo " because the relevant libraries are already in the right place." >&2
        exit 1
    ;;
    *)
        echo "The s3_hadoop element is only supported on Vanilla and Spark." >&2
        exit 1
    ;;
esac

# NOTE: By definition, the Spark standalone plugin does not contain Hadoop in
# its entirety. Therefore, there are no Hadoop-specific environment settings
# available for modification.
if [ "$plugin_type" != "spark" ]; then
    if [ -f "$HADOOP_ENV_SH_PATH" ]; then
        # Append a classpath loop to hadoop-env.sh. The tools dir path is
        # expanded now; the \$-escaped variables are expanded when the
        # generated hadoop-env.sh is later sourced on the running node.
        cat >> "$HADOOP_ENV_SH_PATH" <<EOF
for f in $HADOOP_TOOLS_DIR_PATH/*.jar; do
  if [ "\$HADOOP_CLASSPATH" ]; then
    export HADOOP_CLASSPATH=\$HADOOP_CLASSPATH:\$f
  else
    export HADOOP_CLASSPATH=\$f
  fi
done
EOF
    else
        echo "Something went wrong: couldn't find Hadoop env settings." >&2
        exit 1
    fi
fi

if [ -d "$SPARK_JARS_DIR_PATH" ]; then
    # Make the aws jars visible to Spark as well.
    cp "$HADOOP_TOOLS_DIR_PATH"/*aws*.jar "$SPARK_JARS_DIR_PATH"
    chmod 0644 "$SPARK_JARS_DIR_PATH"/*aws*jar
else
    # NOTE: In the case of Vanilla, the user may have disabled the Spark
    # element. So, check for the existence of the directory explicitly, but
    # crucially do not consider it an error if the folder does not exist.
    if [ "$plugin_type" != "vanilla" ]; then
        echo "Something went wrong: couldn't find Spark installation." >&2
        exit 1
    fi
fi
|
54
elements/s3_hadoop/post-install.d/89-manipulate-s3
Executable file
54
elements/s3_hadoop/post-install.d/89-manipulate-s3
Executable file
@ -0,0 +1,54 @@
|
||||
#!/bin/bash
# s3_hadoop element (89-manipulate-s3): fetch/copy the hadoop-aws (S3A) jars
# and their runtime dependencies into the Hadoop common and Spark jar dirs.
#
# Required env: plugin_type        - target sahara plugin
# Optional env: DIB_HADOOP_VERSION - Hadoop version (vanilla only)
#               SPARK_HADOOP_DL    - Hadoop flavor of the Spark download
#                                    (e.g. "hadoop2.6", "hadoop2.7")
if [ "${DIB_DEBUG_TRACE:-0}" -gt 0 ]; then
    set -x
fi
set -eu
set -o pipefail

case "$plugin_type" in
    "vanilla" | "spark" )
    ;;
    "cloudera" )
        echo -n "The s3_hadoop element is not supported on CDH," >&2
        echo " because the relevant libraries need no manipulation." >&2
        # NOTE: actually the above statement is only true on CDH>=5.9
        exit 1
    ;;
    *)
        # TODO: Investigate if some changes are in fact needed for HDP, MapR
        echo "The s3_hadoop element is only relevant to Vanilla and Spark." >&2
        exit 1
    ;;
esac

SPARK_JARS_DIR_PATH="/opt/spark/jars"
HADOOP_TOOLS_DIR_PATH="/opt/hadoop/share/hadoop/tools/lib"
HADOOP_COMMON_DIR_PATH="/opt/hadoop/share/hadoop/common/lib"

if [ "$plugin_type" = "vanilla" ]; then
    # NOTE(review): under 'set -u' a bare $DIB_HADOOP_VERSION aborts the
    # whole script when the variable was never exported; default it to the
    # empty string so the version checks simply fall through instead.
    hadoop_version="${DIB_HADOOP_VERSION:-}"
    case "$hadoop_version" in
        "2.7.1" | "2.7.5" )
            # These versions need a patched hadoop-aws jar
            wget https://tarballs.openstack.org/sahara-extra/dist/common-artifacts/hadoop-aws-$DIB_HADOOP_VERSION.jar -O "$HADOOP_TOOLS_DIR_PATH/hadoop-aws-$DIB_HADOOP_VERSION.jar"
        ;;
    esac

    # NOTE: It's easier just to copy, than to mess with YARN
    cp "$HADOOP_TOOLS_DIR_PATH"/*aws*.jar "$HADOOP_COMMON_DIR_PATH"
    case "$hadoop_version" in
        "2.7.1" | "2.7.5" | "2.8.2" )
            # Hadoop-aws older than 2.9.0 needs these too
            cp "$HADOOP_TOOLS_DIR_PATH"/joda-time*.jar "$HADOOP_COMMON_DIR_PATH"
            # The following jars are also on-disk, but under the wrong namespace
            wget http://central.maven.org/maven2/com/fasterxml/jackson/core/jackson-core/2.5.3/jackson-core-2.5.3.jar -O "$HADOOP_COMMON_DIR_PATH/jackson-core.jar"
            wget http://central.maven.org/maven2/com/fasterxml/jackson/core/jackson-databind/2.5.3/jackson-databind-2.5.3.jar -O "$HADOOP_COMMON_DIR_PATH/jackson-databind.jar"
            wget http://central.maven.org/maven2/com/fasterxml/jackson/core/jackson-annotations/2.5.3/jackson-annotations-2.5.3.jar -O "$HADOOP_COMMON_DIR_PATH/jackson-annotations.jar"
        ;;
    esac
fi

# For both Spark and Vanilla plugins:
# (The s3a driver in hadoop-aws 2.6.5 is too buggy to be redeemed)
# NOTE(review): SPARK_HADOOP_DL is only exported on the Spark build path;
# defaulting it avoids a 'set -u' abort on vanilla runs — confirm this
# matches the intent of the image-creation script.
if [ "${SPARK_HADOOP_DL:-}" != "hadoop2.6" ]; then
    # The hadoop-aws and aws-java-sdk libraries are missing here, but we
    # cannot copy them from the Hadoop folder on-disk due to
    # version/patching issues
    wget https://tarballs.openstack.org/sahara-extra/dist/common-artifacts/hadoop-aws-2.7.3.jar -O "$SPARK_JARS_DIR_PATH/hadoop-aws.jar"
    wget http://central.maven.org/maven2/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar -O "$SPARK_JARS_DIR_PATH/aws-java-sdk.jar"
fi
|
@ -13,13 +13,14 @@ mkdir -p $tmp_dir
|
||||
if [ -z "${SPARK_DOWNLOAD_URL:-}" ]; then
|
||||
# Check hadoop version
|
||||
# INFO on hadoop versions: http://spark.apache.org/docs/latest/hadoop-third-party-distributions.html
|
||||
# Now the below is just a sanity check
|
||||
if [ -z "${SPARK_HADOOP_DL:-}" ]; then
|
||||
case "${DIB_CDH_VERSION:-}" in
|
||||
5.5)
|
||||
SPARK_HADOOP_DL=hadoop2.6
|
||||
SPARK_HADOOP_DL=hadoop2.7
|
||||
;;
|
||||
5.11)
|
||||
SPARK_HADOOP_DL=hadoop2.6
|
||||
SPARK_HADOOP_DL=hadoop2.7
|
||||
;;
|
||||
*)
|
||||
echo "WARNING: Cloudera CDH $DIB_CDH_VERSION not supported."
|
||||
|
Loading…
Reference in New Issue
Block a user