Totally rewrite s3_hadoop
Fix patching problems, version conflicts, classpath issues, etc. Also: switch the Hadoop libraries used by the Spark standalone plugin to Hadoop 2.7.3. The version was previously 2.6.5, to match Cloudera's so-called "Hadoop 2.6.0", but in fact the two versions do not need to match. Change-Id: Iafafb64fd60a1ae585375a68173c84fbb82c7e1f
This commit is contained in:
parent
83224a6c5e
commit
7910521a7e
@ -622,6 +622,11 @@ if [ -z "$PLUGIN" -o "$PLUGIN" = "spark" ]; then
|
|||||||
export DIB_RELEASE=${DIB_RELEASE:-trusty}
|
export DIB_RELEASE=${DIB_RELEASE:-trusty}
|
||||||
export DIB_CDH_VERSION="5.5"
|
export DIB_CDH_VERSION="5.5"
|
||||||
fi
|
fi
|
||||||
|
if [ "$DIB_SPARK_VERSION" = "1.6.0" ]; then
|
||||||
|
export SPARK_HADOOP_DL=hadoop2.6
|
||||||
|
else
|
||||||
|
export SPARK_HADOOP_DL=hadoop2.7
|
||||||
|
fi
|
||||||
# Tell the cloudera element to install only hdfs
|
# Tell the cloudera element to install only hdfs
|
||||||
export DIB_CDH_HDFS_ONLY=1
|
export DIB_CDH_HDFS_ONLY=1
|
||||||
|
|
||||||
@ -630,6 +635,7 @@ if [ -z "$PLUGIN" -o "$PLUGIN" = "spark" ]; then
|
|||||||
|
|
||||||
# Creating Ubuntu cloud image
|
# Creating Ubuntu cloud image
|
||||||
image_create ubuntu $ubuntu_image_name $ubuntu_elements_sequence
|
image_create ubuntu $ubuntu_image_name $ubuntu_elements_sequence
|
||||||
|
unset SPARK_HADOOP_DL
|
||||||
unset DIB_CLOUD_INIT_DATASOURCES
|
unset DIB_CLOUD_INIT_DATASOURCES
|
||||||
unset DIB_HDFS_LIB_DIR
|
unset DIB_HDFS_LIB_DIR
|
||||||
unset DIB_CDH_HDFS_ONLY
|
unset DIB_CDH_HDFS_ONLY
|
||||||
|
@ -1,60 +0,0 @@
|
|||||||
#!/bin/bash
# s3_hadoop element: make the Hadoop AWS jars visible to Hadoop and Spark.
#
# Reads plugin_type from the environment. Only "vanilla" and "spark" are
# processed; any other value prints a message and exits with status 1.

if [ "${DIB_DEBUG_TRACE:-0}" -gt 0 ]; then
    set -x
fi
set -eu
set -o pipefail

# Both supported plugins use the same Spark jars directory.
SPARK_JARS_DIR_PATH="/opt/spark/jars"

if [ "$plugin_type" = "vanilla" ]; then
    HADOOP_TOOLS_DIR_PATH="/opt/hadoop/share/hadoop/tools/lib"
    HADOOP_ENV_SH_PATH="/opt/hadoop/etc/hadoop/hadoop-env.sh"
elif [ "$plugin_type" = "spark" ]; then
    HADOOP_TOOLS_DIR_PATH="/usr/lib/hadoop/client"
elif [ "$plugin_type" = "cloudera" ]; then
    echo -n "The s3_hadoop element is not supported on CDH,"
    echo " because the relevant libraries are already in the right place."
    exit 1
else
    echo "The s3_hadoop element is only supported on Vanilla and Spark."
    exit 1
fi

# NOTE: The Spark standalone plugin does not ship a complete Hadoop, so it
# has no Hadoop environment file to extend. Only Vanilla gets this step.
if [ "$plugin_type" = "vanilla" ]; then
    if [ ! -f "$HADOOP_ENV_SH_PATH" ]; then
        echo "Something went wrong: couldn't find Hadoop env settings."
        exit 1
    fi

    # The tools directory is expanded now, while writing; the escaped
    # variables stay literal and are evaluated when hadoop-env.sh is sourced.
    cat >> "$HADOOP_ENV_SH_PATH" <<EOF
for f in $HADOOP_TOOLS_DIR_PATH/*.jar; do
if [ "\$HADOOP_CLASSPATH" ]; then
export HADOOP_CLASSPATH=\$HADOOP_CLASSPATH:\$f
else
export HADOOP_CLASSPATH=\$f
fi
done
EOF
fi

if [ -d "$SPARK_JARS_DIR_PATH" ]; then
    cp "$HADOOP_TOOLS_DIR_PATH"/*aws*.jar "$SPARK_JARS_DIR_PATH"
    chmod 0644 "$SPARK_JARS_DIR_PATH"/*aws*jar
elif [ "$plugin_type" != "vanilla" ]; then
    # NOTE: On Vanilla the Spark element is optional, so a missing Spark
    # directory is tolerated there. For any other plugin it is an error.
    echo "Something went wrong: couldn't find Spark installation."
    exit 1
fi
|
|
54
elements/s3_hadoop/post-install.d/89-manipulate-s3
Executable file
54
elements/s3_hadoop/post-install.d/89-manipulate-s3
Executable file
@ -0,0 +1,54 @@
|
|||||||
|
#!/bin/bash
# s3_hadoop element, post-install step.
#
# Places the jars used by the s3a filesystem driver into the Hadoop common
# library directory (Vanilla plugin only) and into the Spark jars directory.
#
# Environment read:
#   plugin_type         - required; only "vanilla" and "spark" are accepted,
#                         any other value exits with status 1
#   DIB_HADOOP_VERSION  - required when plugin_type is "vanilla"
#   SPARK_HADOOP_DL     - optional; the value "hadoop2.6" skips the Spark
#                         jar download, anything else (or unset) performs it
#   DIB_DEBUG_TRACE     - optional; a value above 0 enables shell tracing

if [ "${DIB_DEBUG_TRACE:-0}" -gt 0 ]; then
    set -x
fi
set -eu
set -o pipefail

# Maven Central is only reachable over HTTPS; the former
# http://central.maven.org host is no longer served.
MAVEN_REPO_URL="https://repo1.maven.org/maven2"
SAHARA_ARTIFACTS_URL="https://tarballs.openstack.org/sahara-extra/dist/common-artifacts"

case "$plugin_type" in
    "vanilla" | "spark" )
        ;;
    "cloudera" )
        echo -n "The s3_hadoop element is not supported on CDH,"
        echo " because the relevant libraries need no manipulation."
        # NOTE: actually the above statement is only true on CDH>=5.9
        exit 1
        ;;
    *)
        # TODO: Investigate if some changes are in fact needed for HDP, MapR
        echo "The s3_hadoop element is only relevant to Vanilla and Spark."
        exit 1
esac

SPARK_JARS_DIR_PATH="/opt/spark/jars"
HADOOP_TOOLS_DIR_PATH="/opt/hadoop/share/hadoop/tools/lib"
HADOOP_COMMON_DIR_PATH="/opt/hadoop/share/hadoop/common/lib"

if [ "$plugin_type" = "vanilla" ]; then
    case "$DIB_HADOOP_VERSION" in
        "2.7.1" | "2.7.5" )
            # These versions need a patched hadoop-aws jar
            wget "$SAHARA_ARTIFACTS_URL/hadoop-aws-$DIB_HADOOP_VERSION.jar" \
                -O "$HADOOP_TOOLS_DIR_PATH/hadoop-aws-$DIB_HADOOP_VERSION.jar"
            ;;
    esac

    # NOTE: It's easier just to copy, than to mess with YARN
    cp "$HADOOP_TOOLS_DIR_PATH"/*aws*.jar "$HADOOP_COMMON_DIR_PATH"

    case "$DIB_HADOOP_VERSION" in
        "2.7.1" | "2.7.5" | "2.8.2" )
            # Hadoop-aws older than 2.9.0 needs these too
            cp "$HADOOP_TOOLS_DIR_PATH"/joda-time*.jar "$HADOOP_COMMON_DIR_PATH"
            # The following jars are also on-disk, but under the wrong namespace
            for jackson_module in core databind annotations; do
                wget "$MAVEN_REPO_URL/com/fasterxml/jackson/core/jackson-$jackson_module/2.5.3/jackson-$jackson_module-2.5.3.jar" \
                    -O "$HADOOP_COMMON_DIR_PATH/jackson-$jackson_module.jar"
            done
            ;;
    esac
fi

# For both Spark and Vanilla plugins:
# (The s3a driver in hadoop-aws 2.6.5 is too buggy to be redeemed)
# SPARK_HADOOP_DL is read with an empty default so that an unset value does
# not abort the script under "set -u"; unset is treated as "not hadoop2.6".
if [ "${SPARK_HADOOP_DL:-}" != "hadoop2.6" ]; then
    # The hadoop-aws and aws-java-sdk libraries are missing here, but we
    # cannot copy them from the Hadoop folder on-disk due to
    # version/patching issues
    wget "$SAHARA_ARTIFACTS_URL/hadoop-aws-2.7.3.jar" \
        -O "$SPARK_JARS_DIR_PATH/hadoop-aws.jar"
    wget "$MAVEN_REPO_URL/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar" \
        -O "$SPARK_JARS_DIR_PATH/aws-java-sdk.jar"
fi
|
@ -13,13 +13,14 @@ mkdir -p $tmp_dir
|
|||||||
if [ -z "${SPARK_DOWNLOAD_URL:-}" ]; then
|
if [ -z "${SPARK_DOWNLOAD_URL:-}" ]; then
|
||||||
# Check hadoop version
|
# Check hadoop version
|
||||||
# INFO on hadoop versions: http://spark.apache.org/docs/latest/hadoop-third-party-distributions.html
|
# INFO on hadoop versions: http://spark.apache.org/docs/latest/hadoop-third-party-distributions.html
|
||||||
|
# Now the below is just a sanity check
|
||||||
if [ -z "${SPARK_HADOOP_DL:-}" ]; then
|
if [ -z "${SPARK_HADOOP_DL:-}" ]; then
|
||||||
case "${DIB_CDH_VERSION:-}" in
|
case "${DIB_CDH_VERSION:-}" in
|
||||||
5.5)
|
5.5)
|
||||||
SPARK_HADOOP_DL=hadoop2.6
|
SPARK_HADOOP_DL=hadoop2.7
|
||||||
;;
|
;;
|
||||||
5.11)
|
5.11)
|
||||||
SPARK_HADOOP_DL=hadoop2.6
|
SPARK_HADOOP_DL=hadoop2.7
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
echo "WARNING: Cloudera CDH $DIB_CDH_VERSION not supported."
|
echo "WARNING: Cloudera CDH $DIB_CDH_VERSION not supported."
|
||||||
|
Loading…
Reference in New Issue
Block a user