| #!/usr/bin/env bash |
| |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # |
| |
| # |
| # Script to create a binary distribution for easy deploys of Spark. |
| # The distribution directory defaults to dist/ but can be overridden below. |
| # The distribution contains fat (assembly) jars that include the Scala library, |
| # so it is completely self contained. |
| # It does not contain source or *.class files. |
| |
# Fail fast: abort on any error (including failures inside pipelines) and
# trace every command so build problems are easy to diagnose.
set -o pipefail
set -e
set -x

# Figure out where the Spark framework is installed.
# $(...) replaces the legacy backticks, and "&&" makes a failed cd abort the
# script (via set -e) instead of silently computing the wrong directory.
SPARK_HOME="$(cd "$(dirname "$0")/.." && pwd)"
DISTDIR="$SPARK_HOME/dist"

# Defaults for the command-line options parsed below.
MAKE_TGZ=false
MAKE_PIP=false
MAKE_R=false
NAME=none
MVN="$SPARK_HOME/build/mvn"
| |
# Print usage help for this script and terminate with a non-zero status.
# Disables tracing first so the help text is not interleaved with -x output.
function exit_with_usage {
  set +x
  cat <<USAGE
make-distribution.sh - tool for making binary distributions of Spark

usage:
make-distribution.sh [--name] [--tgz] [--pip] [--r] [--mvn <mvn-command>] <maven build options>
See Spark's "Building Spark" doc for correct Maven options.

USAGE
  exit 1
}
| |
# Consume the flags this script understands. Parsing stops at the first
# other "-..." token so the remaining words can be forwarded to Maven.
while (( "$#" )); do
  case "$1" in
    --tgz)  MAKE_TGZ=true ;;
    --pip)  MAKE_PIP=true ;;
    --r)    MAKE_R=true ;;
    --mvn)  MVN="$2"; shift ;;
    --name) NAME="$2"; shift ;;
    --help) exit_with_usage ;;
    --*)
      # Unknown long option: report it and show usage.
      echo "Error: $1 is not supported"
      exit_with_usage
      ;;
    -*)
      # First Maven-style option: leave it (and the rest) for the build.
      break
      ;;
    *)
      # Bare words are not accepted either.
      echo "Error: $1 is not supported"
      exit_with_usage
      ;;
  esac
  shift
done
| |
# Locate a usable JAVA_HOME when the caller did not export one.
if [ -z "$JAVA_HOME" ]; then
  # Fall back on JAVA_HOME from rpm, if found
  if command -v rpm >/dev/null 2>&1; then
    RPM_JAVA_HOME="$(rpm -E %java_home 2>/dev/null)"
    # rpm echoes the macro name back verbatim when %java_home is undefined.
    if [ "$RPM_JAVA_HOME" != "%java_home" ]; then
      JAVA_HOME="$RPM_JAVA_HOME"
      echo "No JAVA_HOME set, proceeding with '$JAVA_HOME' learned from rpm"
    fi
  fi

  if [ -z "$JAVA_HOME" ]; then
    if command -v java >/dev/null 2>&1; then
      # If java is in /usr/bin/java, we want /usr. Quote the nested
      # substitutions so paths containing spaces survive.
      JAVA_HOME="$(dirname "$(dirname "$(command -v java)")")"
    fi
  fi
fi

if [ -z "$JAVA_HOME" ]; then
  echo "Error: JAVA_HOME is not set, cannot proceed." >&2
  exit 1
fi
| |
# Record the abbreviated git revision (when building from a git checkout)
# so it can be embedded in the RELEASE file later on.
if command -v git >/dev/null 2>&1; then
  # "|| :" keeps set -e happy when this is not a git work tree.
  GITREV=$(git rev-parse --short HEAD 2>/dev/null || :)
  if [ -n "$GITREV" ]; then
    GITREVSTRING=" (git revision $GITREV)"
  fi
  unset GITREV
fi
| |
| |
# Bail out early if the Maven launcher cannot be found; diagnostics go to
# stderr and the exit status is a valid 0-255 value (not the old "exit -1").
if ! command -v "$MVN" >/dev/null 2>&1; then
  echo "Could not locate Maven command: '$MVN'." >&2
  echo "Specify the Maven command with the --mvn flag" >&2
  exit 1
fi
| |
# Evaluate a Maven property, filtering Maven's log noise down to the value.
# Arguments: $1 - property expression; remaining args - extra Maven options.
function mvn_evaluate {
  local expression="$1"
  shift
  "$MVN" help:evaluate -Dexpression="$expression" "$@" \
    | grep -v "INFO" \
    | grep -v "WARNING" \
    | tail -n 1
}

# "$@" (quoted) preserves user-supplied Maven options containing spaces,
# which the previous unquoted $@ silently split apart.
VERSION=$(mvn_evaluate project.version "$@")
SCALA_VERSION=$(mvn_evaluate scala.binary.version "$@")
SPARK_HADOOP_VERSION=$(mvn_evaluate hadoop.version "$@")
SPARK_HIVE=$("$MVN" help:evaluate -Dexpression=project.activeProfiles -pl sql/hive "$@" \
    | grep -v "INFO"\
    | grep -v "WARNING"\
    | grep -F --count "<id>hive</id>";\
    # Reset exit status to 0, otherwise the script stops here if the last grep finds nothing\
    # because we use "set -o pipefail"
    echo -n)

# Default the release name to the Hadoop version it was built against.
if [ "$NAME" == "none" ]; then
  NAME=$SPARK_HADOOP_VERSION
fi
| |
# Announce the version and what this run is going to produce.
echo "Spark version is $VERSION"

if [ "$MAKE_TGZ" != "true" ]; then
  echo "Making distribution for Spark $VERSION in '$DISTDIR'..."
else
  echo "Making spark-$VERSION-bin-$NAME.tgz"
fi
| |
# Build uber fat JAR
cd "$SPARK_HOME"

# Respect a caller-provided MAVEN_OPTS; otherwise use roomy build defaults.
export MAVEN_OPTS="${MAVEN_OPTS:--Xss128m -Xmx4g -XX:ReservedCodeCacheSize=128m}"

# Store the command as an array because $MVN variable might have spaces in it.
# Normal quoting tricks don't work.
# See: http://mywiki.wooledge.org/BashFAQ/050
# "$@" (quoted) keeps each extra Maven option intact even if it has spaces.
BUILD_COMMAND=("$MVN" clean package \
    -DskipTests \
    -Dmaven.javadoc.skip=true \
    -Dmaven.scaladoc.skip=true \
    -Dmaven.source.skip \
    -Dcyclonedx.skip=true \
    "$@")

# Actually build the jar. [*] joins the words for display only.
echo -e "\nBuilding with..."
echo -e "\$ ${BUILD_COMMAND[*]}\n"

"${BUILD_COMMAND[@]}"
| |
# Make directories
rm -rf "$DISTDIR"
mkdir -p "$DISTDIR/jars"
echo "Spark $VERSION$GITREVSTRING built for Hadoop $SPARK_HADOOP_VERSION" > "$DISTDIR/RELEASE"
echo "Build flags: $@" >> "$DISTDIR/RELEASE"

# Copy jars
cp "$SPARK_HOME"/assembly/target/scala*/jars/* "$DISTDIR/jars/"

# Segregate the Hive 1.x Jackson (ASL) jars into their own directory, but
# only when the build actually produced them. The directory is created once,
# outside the loop, and "$f" is quoted so jar paths with spaces survive.
if [ -f "$DISTDIR"/jars/jackson-core-asl-1.9.13.jar ]; then
  mkdir -p "$DISTDIR"/hive-jackson
  for f in "$DISTDIR"/jars/jackson-*-asl-*.jar; do
    mv "$f" "$DISTDIR"/hive-jackson/
  done
fi
| |
# Only create the yarn directory if the yarn artifacts were built.
# NOTE(review): the glob inside [ -f ] assumes at most one scala-* target
# directory matches; with multiple matches the test gets several arguments,
# errors out, and is treated as false — confirm single-Scala-version builds
# are the only supported case here.
if [ -f "$SPARK_HOME"/common/network-yarn/target/scala*/spark-*-yarn-shuffle.jar ]; then
  mkdir "$DISTDIR/yarn"
  cp "$SPARK_HOME"/common/network-yarn/target/scala*/spark-*-yarn-shuffle.jar "$DISTDIR/yarn"
fi

# Only create and copy the dockerfiles directory if the kubernetes artifacts were built.
if [ -d "$SPARK_HOME"/resource-managers/kubernetes/core/target/ ]; then
  mkdir -p "$DISTDIR/kubernetes/"
  # -a preserves permissions and symlinks of the docker build context.
  cp -a "$SPARK_HOME"/resource-managers/kubernetes/docker/src/main/dockerfiles "$DISTDIR/kubernetes/"
  cp -a "$SPARK_HOME"/resource-managers/kubernetes/integration-tests/tests "$DISTDIR/kubernetes/"
fi
| |
# Copy the example jars and their dependencies into the distribution.
mkdir -p "$DISTDIR/examples/jars"
cp "$SPARK_HOME"/examples/target/scala*/jars/* "$DISTDIR/examples/jars"

# Drop any example jar that already ships with the main Spark jars so the
# distribution does not carry duplicate copies.
for jarfile in "$DISTDIR"/examples/jars/*; do
  name=$(basename "$jarfile")
  if [ -f "$DISTDIR/jars/$name" ]; then
    rm "$DISTDIR/examples/jars/$name"
  fi
done

# Copy example sources (needed for python and SQL)
mkdir -p "$DISTDIR/examples/src/main"
cp -r "$SPARK_HOME/examples/src/main" "$DISTDIR/examples/src/"
| |
# Ship the binary-distribution license bundle when present; source trees
# without one simply skip this step.
if [ ! -e "$SPARK_HOME/LICENSE-binary" ]; then
  echo "Skipping copying LICENSE files"
else
  cp "$SPARK_HOME/LICENSE-binary" "$DISTDIR/LICENSE"
  cp -r "$SPARK_HOME/licenses-binary" "$DISTDIR/licenses"
  cp "$SPARK_HOME/NOTICE-binary" "$DISTDIR/NOTICE"
fi

# Legacy changelog, kept only for source trees that still carry one.
[ ! -e "$SPARK_HOME/CHANGES.txt" ] || cp "$SPARK_HOME/CHANGES.txt" "$DISTDIR"

# Copy data files
cp -r "$SPARK_HOME/data" "$DISTDIR"
| |
# Build the pip (sdist) packages when requested.
if [ "$MAKE_PIP" != "true" ]; then
  echo "Skipping building python distribution package"
else
  echo "Building python distribution package"
  pushd "$SPARK_HOME/python" > /dev/null
  # Stale egg-info can cache metadata from older setup files — clear it first.
  rm -rf pyspark.egg-info || echo "No existing egg info file, skipping deletion"
  python3 packaging/classic/setup.py sdist
  python3 packaging/connect/setup.py sdist
  popd > /dev/null
fi
| |
# Make R package - this is used for both CRAN release and packing R layout into distribution
if [ "$MAKE_R" == "true" ]; then
  echo "Building R source package"
  # Package version is the last field of the Version: line in DESCRIPTION.
  # $(...) replaces the legacy backticks.
  R_PACKAGE_VERSION=$(grep Version "$SPARK_HOME/R/pkg/DESCRIPTION" | awk '{print $NF}')
  pushd "$SPARK_HOME/R" > /dev/null
  # Build source package and run full checks
  # Do not source the check-cran.sh - it should be run from where it is for it to set SPARK_HOME
  NO_TESTS=1 "$SPARK_HOME/R/check-cran.sh"

  # Move R source package to match the Spark release version if the versions are not the same.
  # NOTE(shivaram): `mv` throws an error on Linux if source and destination are same file
  if [ "$R_PACKAGE_VERSION" != "$VERSION" ]; then
    mv "$SPARK_HOME/R/SparkR_$R_PACKAGE_VERSION.tar.gz" "$SPARK_HOME/R/SparkR_$VERSION.tar.gz"
  fi

  # Install source package to get it to generate vignettes rds files, etc.
  VERSION=$VERSION "$SPARK_HOME/R/install-source-package.sh"
  popd > /dev/null
else
  echo "Skipping building R source package"
fi
| |
# Copy the remaining pieces: config templates, README, launcher scripts,
# and the Python sources.
mkdir "$DISTDIR/conf"
cp "$SPARK_HOME"/conf/*.template "$DISTDIR/conf"
cp "$SPARK_HOME/README.md" "$DISTDIR"
cp -r "$SPARK_HOME/bin" "$DISTDIR"
cp -r "$SPARK_HOME/python" "$DISTDIR"

# The pip step above already produced the sdist tarballs; do not ship a
# second copy inside dist/python.
[ "$MAKE_PIP" != "true" ] || rm -f "$DISTDIR"/python/dist/pyspark-*.tar.gz

cp -r "$SPARK_HOME/sbin" "$DISTDIR"

# Bundle the pre-built SparkR library when the R build produced one.
if [ -d "$SPARK_HOME/R/lib/SparkR" ]; then
  mkdir -p "$DISTDIR/R/lib"
  cp -r "$SPARK_HOME/R/lib/SparkR" "$DISTDIR/R/lib"
  cp "$SPARK_HOME/R/lib/sparkr.zip" "$DISTDIR/R/lib"
fi
| |
# Package the finished dist/ directory into a versioned .tgz when requested.
if [ "$MAKE_TGZ" == "true" ]; then
  TARDIR_NAME=spark-$VERSION-bin-$NAME
  TARDIR="$SPARK_HOME/$TARDIR_NAME"
  rm -rf "$TARDIR"
  cp -r "$DISTDIR" "$TARDIR"
  # Build the tar invocation as an array so the optional flags survive
  # quoting; the old string form relied on unquoted word-splitting.
  TAR=(tar)
  if [ "$(uname -s)" = "Darwin" ]; then
    # macOS bsdtar embeds resource forks/xattrs/file flags by default; strip
    # them so the archive matches one produced on Linux.
    TAR=(tar --no-mac-metadata --no-xattrs --no-fflags)
  fi
  "${TAR[@]}" -czf "spark-$VERSION-bin-$NAME.tgz" -C "$SPARK_HOME" "$TARDIR_NAME"
  rm -rf "$TARDIR"
fi