Totally rewrite s3_hadoop

Fix patching problems, version conflicts, classpath issues, etc. Also: switch the Hadoop libraries used by the Spark standalone plugin to Hadoop 2.7.3. The version was previously 2.6.5, to match Cloudera's so-called "Hadoop 2.6.0", but in fact this correspondence is not necessary at all... Change-Id: Iafafb64fd60a1ae585375a68173c84fbb82c7e1f
2018-06-26 16:17:26 -04:00 · 2018-06-26 16:17:26 -04:00 · 7910521a7e
commit 7910521a7e
parent 83224a6c5e
4 changed files with 63 additions and 62 deletions
--- a/diskimage-create/diskimage-create.sh
+++ b/diskimage-create/diskimage-create.sh
@ -622,6 +622,11 @@ if [ -z "$PLUGIN" -o "$PLUGIN" = "spark" ]; then
        export DIB_RELEASE=${DIB_RELEASE:-trusty}
        export DIB_CDH_VERSION="5.5"
    fi
    # Choose the Hadoop build label that goes with the requested Spark
    # version: 1.6.0 gets hadoop2.6, anything else gets hadoop2.7.
    case "$DIB_SPARK_VERSION" in
        "1.6.0")
            SPARK_HADOOP_DL=hadoop2.6
            ;;
        *)
            SPARK_HADOOP_DL=hadoop2.7
            ;;
    esac
    export SPARK_HADOOP_DL
    # Tell the cloudera element to install only hdfs
    DIB_CDH_HDFS_ONLY=1
    export DIB_CDH_HDFS_ONLY
@ -630,6 +635,7 @@ if [ -z "$PLUGIN" -o "$PLUGIN" = "spark" ]; then
    # Creating Ubuntu cloud image
    image_create ubuntu $ubuntu_image_name $ubuntu_elements_sequence
    # The image has been created; remove the build-time settings (among them
    # the SPARK_HADOOP_DL and DIB_CDH_HDFS_ONLY exports made earlier in this
    # branch) from the environment again.
    unset SPARK_HADOOP_DL
    unset DIB_CLOUD_INIT_DATASOURCES
    unset DIB_HDFS_LIB_DIR
    unset DIB_CDH_HDFS_ONLY
--- a/elements/s3_hadoop/post-install.d/89-add-amazon-jar
+++ b/elements/s3_hadoop/post-install.d/89-add-amazon-jar
@ -1,60 +0,0 @@
 #!/bin/bash
 # Post-install step of the s3_hadoop element.
 #
 # Reads $plugin_type and:
 #   * for every plugin except "spark", appends a snippet to hadoop-env.sh
 #     that adds each jar in the Hadoop tools directory to HADOOP_CLASSPATH;
 #   * copies the *aws*.jar files from the Hadoop tools directory into the
 #     Spark jars directory, when that directory exists.
 # Exits with status 1 for "cloudera" and for any unrecognised plugin type.
 if [ "${DIB_DEBUG_TRACE:-0}" -gt 0 ]; then
    set -x
 fi
 set -eu
 set -o pipefail
 # Select the per-plugin paths. HADOOP_ENV_SH_PATH is only assigned for
 # "vanilla"; it is only read below when plugin_type is not "spark".
 case "$plugin_type" in
    "vanilla" )
        HADOOP_TOOLS_DIR_PATH="/opt/hadoop/share/hadoop/tools/lib"
        HADOOP_ENV_SH_PATH="/opt/hadoop/etc/hadoop/hadoop-env.sh"
        SPARK_JARS_DIR_PATH="/opt/spark/jars"
        ;;
    "spark" )
        HADOOP_TOOLS_DIR_PATH="/usr/lib/hadoop/client"
        SPARK_JARS_DIR_PATH="/opt/spark/jars"
        ;;
    "cloudera" )
        echo -n "The s3_hadoop element is not supported on CDH,"
        echo " because the relevant libraries are already in the right place."
        exit 1
        ;;
    *)
        echo "The s3_hadoop element is only supported on Vanilla and Spark."
        exit 1
 esac
 # NOTE: By definition, the Spark standalone plugin does not contain Hadoop in
 # its entirety. Therefore, there are no Hadoop-specific environment settings
 # available for modification.
 if [ "$plugin_type" != "spark" ]; then
    if [ -f "$HADOOP_ENV_SH_PATH" ]; then
        # The here-document delimiter is unquoted, so $HADOOP_TOOLS_DIR_PATH is
        # expanded now, while the backslash-escaped \$HADOOP_CLASSPATH and \$f
        # are written literally and evaluated when hadoop-env.sh is sourced.
        cat >> $HADOOP_ENV_SH_PATH <<EOF
 for f in $HADOOP_TOOLS_DIR_PATH/*.jar; do
    if [ "\$HADOOP_CLASSPATH" ]; then
        export HADOOP_CLASSPATH=\$HADOOP_CLASSPATH:\$f
    else
        export HADOOP_CLASSPATH=\$f
    fi
 done
 EOF
    else
        echo "Something went wrong: couldn't find Hadoop env settings."
        exit 1
    fi
 fi
 if [ -d "$SPARK_JARS_DIR_PATH" ]; then
    cp $HADOOP_TOOLS_DIR_PATH/*aws*.jar $SPARK_JARS_DIR_PATH
    # NOTE(review): this glob is *aws*jar (no dot before "jar"), which is
    # slightly broader than the *aws*.jar pattern used for the copy above.
    chmod 0644 $SPARK_JARS_DIR_PATH/*aws*jar
 else
    # NOTE: In the case of Vanilla, the user may have disabled the Spark
    # element. So, check for the existence of the directory explicitly, but
    # crucially do not consider it an error if the folder does not exist.
    if [ "$plugin_type" != "vanilla" ]; then
        echo "Something went wrong: couldn't find Spark installation."
        exit 1
    fi
 fi
--- a/elements/s3_hadoop/post-install.d/89-manipulate-s3
+++ b/elements/s3_hadoop/post-install.d/89-manipulate-s3
@ -0,0 +1,54 @@
 #!/bin/bash
 # Post-install step of the s3_hadoop element.
 #
 # Puts the jars needed for S3 access where Hadoop (Vanilla plugin) and
 # Spark (Vanilla and Spark plugins) look for them.
 #
 # Environment:
 #   plugin_type         required; "vanilla" or "spark" (anything else exits 1)
 #   DIB_HADOOP_VERSION  required when plugin_type is "vanilla"
 #   SPARK_HADOOP_DL     optional; the Spark jars are skipped when it is
 #                       "hadoop2.6", and fetched when it is unset or different
 #   DIB_DEBUG_TRACE     optional; a value greater than 0 enables "set -x"
 if [ "${DIB_DEBUG_TRACE:-0}" -gt 0 ]; then
    set -x
 fi
 set -eu
 set -o pipefail
 case "$plugin_type" in
    "vanilla" | "spark" )
        ;;
    "cloudera" )
        echo -n "The s3_hadoop element is not supported on CDH,"
        echo " because the relevant libraries need no manipulation."
        # NOTE: actually the above statement is only true on CDH>=5.9
        exit 1
        ;;
    *)
        # TODO: Investigate if some changes are in fact needed for HDP, MapR
        echo "The s3_hadoop element is only relevant to Vanilla and Spark."
        exit 1
        ;;
 esac
 SPARK_JARS_DIR_PATH="/opt/spark/jars"
 HADOOP_TOOLS_DIR_PATH="/opt/hadoop/share/hadoop/tools/lib"
 HADOOP_COMMON_DIR_PATH="/opt/hadoop/share/hadoop/common/lib"
 SAHARA_ARTIFACTS_URL="https://tarballs.openstack.org/sahara-extra/dist/common-artifacts"
 # Maven Central is served over HTTPS only; the old plain-HTTP
 # central.maven.org host must not be used for fetching jars.
 MAVEN_CENTRAL_URL="https://repo1.maven.org/maven2"

 # fetch_jar URL DEST
 # Download URL and store it as the file DEST. A failed download aborts the
 # script because of "set -e".
 fetch_jar() {
    wget "$1" -O "$2"
 }

 if [ "$plugin_type" = "vanilla" ]; then
    case "$DIB_HADOOP_VERSION" in
        "2.7.1" | "2.7.5" )
            # These versions need a patched hadoop-aws jar
            fetch_jar "$SAHARA_ARTIFACTS_URL/hadoop-aws-$DIB_HADOOP_VERSION.jar" \
                "$HADOOP_TOOLS_DIR_PATH/hadoop-aws-$DIB_HADOOP_VERSION.jar"
            ;;
    esac
    # NOTE: It's easier just to copy, than to mess with YARN
    cp "$HADOOP_TOOLS_DIR_PATH"/*aws*.jar "$HADOOP_COMMON_DIR_PATH"
    case "$DIB_HADOOP_VERSION" in
        "2.7.1" | "2.7.5" | "2.8.2" )
            # Hadoop-aws older than 2.9.0 needs these too
            cp "$HADOOP_TOOLS_DIR_PATH"/joda-time*.jar "$HADOOP_COMMON_DIR_PATH"
            # The following jars are also on-disk, but under the wrong namespace
            for jackson_jar in jackson-core jackson-databind jackson-annotations; do
                fetch_jar "$MAVEN_CENTRAL_URL/com/fasterxml/jackson/core/${jackson_jar}/2.5.3/${jackson_jar}-2.5.3.jar" \
                    "$HADOOP_COMMON_DIR_PATH/${jackson_jar}.jar"
            done
            ;;
    esac
 fi
 # For both Spark and Vanilla plugins:
 # (The s3a driver in hadoop-aws 2.6.5 is too buggy to be redeemed)
 # SPARK_HADOOP_DL is read with an empty default so that "set -u" does not
 # abort the script when the caller did not export it.
 if [ "${SPARK_HADOOP_DL:-}" != "hadoop2.6" ]; then
    if [ -d "$SPARK_JARS_DIR_PATH" ]; then
        # The hadoop-aws and aws-java-sdk libraries are missing here, but we
        # cannot copy them from the Hadoop folder on-disk due to
        # version/patching issues
        fetch_jar "$SAHARA_ARTIFACTS_URL/hadoop-aws-2.7.3.jar" \
            "$SPARK_JARS_DIR_PATH/hadoop-aws.jar"
        fetch_jar "$MAVEN_CENTRAL_URL/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar" \
            "$SPARK_JARS_DIR_PATH/aws-java-sdk.jar"
    elif [ "$plugin_type" != "vanilla" ]; then
        # NOTE: On Vanilla the Spark element may have been disabled, so a
        # missing Spark directory is only an error for the Spark plugin.
        echo "Something went wrong: couldn't find Spark installation."
        exit 1
    fi
 fi
--- a/elements/spark/root.d/50-download-spark
+++ b/elements/spark/root.d/50-download-spark
@ -13,13 +13,14 @@ mkdir -p $tmp_dir
 if [ -z "${SPARK_DOWNLOAD_URL:-}" ]; then
    # Check hadoop version
    # INFO on hadoop versions: http://spark.apache.org/docs/latest/hadoop-third-party-distributions.html
    # Now the below is just a sanity check
    if [ -z "${SPARK_HADOOP_DL:-}" ]; then
        case "${DIB_CDH_VERSION:-}" in
            5.5)
-                SPARK_HADOOP_DL=hadoop2.6
+                SPARK_HADOOP_DL=hadoop2.7
            ;;
            5.11)
-                SPARK_HADOOP_DL=hadoop2.6
+                SPARK_HADOOP_DL=hadoop2.7
            ;;
            *)
                echo "WARNING: Cloudera CDH $DIB_CDH_VERSION not supported."