| #!/usr/bin/env bash |
| |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
| # |
| # The Pig command script |
| # |
| # Environment Variables |
| # |
# JAVA_HOME The java implementation to use.
| # |
| # PIG_CLASSPATH Extra Java CLASSPATH entries. |
| # |
| # PIG_USER_CLASSPATH_FIRST If set, add user provided classpath entries to |
| # the top of classpath instead of appending them. |
| # Default is unset, i.e. the classpath entries are |
| # placed normally at the end of a pre-defined classpath. |
| # |
# HADOOP_HOME/HADOOP_PREFIX The Hadoop installation directory (HADOOP_PREFIX is the name used by Hadoop 0.20.205).
| # |
| # HADOOP_CONF_DIR Hadoop conf dir |
| # |
| # PIG_HEAPSIZE The maximum amount of heap to use, in MB. |
| # Default is 1000. |
| # |
| # PIG_OPTS Extra Java runtime options. |
| # |
| # PIG_CONF_DIR Alternate conf dir. Default is ${PIG_HOME}/conf. |
| # |
| # HBASE_HOME Optionally, the HBase installation directory. |
| # Defaults to ${PIG_HOME}/share/hbase |
| # |
# HBASE_CONF_DIR Optionally, the HBase configuration to run against
# when using HBaseStorage. Defaults to ${HBASE_HOME}/conf
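#
# Example invocation (illustrative values; PIG_HEAPSIZE, PIG_CLASSPATH and
# the -x exectype flag are the knobs documented above):
#
#   PIG_HEAPSIZE=2048 PIG_CLASSPATH=/extra/jars bin/pig -x local script.pig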
| |
| cygwin=false |
| case "`uname`" in |
| CYGWIN*) cygwin=true;; |
| esac |
| debug=false |
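# -secretDebugCmd/-printCmdDebug put the script in dry-run mode: the final
# hadoop/java command line is printed instead of executed (see "run it" below).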
| |
remaining=()
includeHCatalog=""
addJarString="-Dpig.additional.jars.uris="
additionalJars=""
prevArgExecType=false
isSparkMode=false
isSparkLocalMode=false
| |
# Determine whether the requested exectype is SPARK or SPARK_LOCAL.
function processExecType(){
execType=$1
execTypeUpperCase=$(echo "$execType" | tr '[a-z]' '[A-Z]')
if [[ "$execTypeUpperCase" == "SPARK" ]]; then
isSparkMode=true
elif [[ "$execTypeUpperCase" == "SPARK_LOCAL" ]]; then
isSparkLocalMode=true
fi
}
| |
| # filter command line parameter |
| for f in "$@"; do |
| if [[ $f == "-secretDebugCmd" || $f == "-printCmdDebug" ]]; then |
| debug=true |
| elif [[ $f == "-useHCatalog" ]]; then |
| # if need to use hcatalog, we need to add the hcatalog and hive jars |
| # to the classpath and also include the hive configuration xml file |
| # for pig to work correctly with hcatalog |
| # because of PIG-2532, including the jars in the classpath is |
| # sufficient to ensure that they are registered as well |
includeHCatalog=true
elif [[ "$includeHCatalog" == "true" && $f == $addJarString* ]]; then
additionalJars=${f#$addJarString}
elif [[ "$f" == "-x" || "$f" == "-exectype" ]]; then
prevArgExecType=true
remaining+=("$f")
elif [[ "$prevArgExecType" == "true" ]]; then
prevArgExecType=false
processExecType "$f"
remaining+=("$f")
else
remaining+=("$f")
fi
| done |
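# "remaining" now holds only the arguments Pig itself should see; e.g.
# `pig -useHCatalog script.pig` leaves just "script.pig" in the array, while
# the HCatalog flag is handled entirely by this wrapper.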
| |
| # resolve links - $0 may be a softlink |
| this="${BASH_SOURCE-$0}" |
| |
| # convert relative path to absolute path |
| bin=$(cd -P -- "$(dirname -- "$this")">/dev/null && pwd -P) |
| script="$(basename -- "$this")" |
| this="$bin/$script" |
| |
| # the root of the Pig installation |
| if [ -z "$PIG_HOME" ]; then |
| export PIG_HOME=`dirname "$this"`/.. |
| fi |
| |
| if [ -z "$PIG_CONF_DIR" ]; then |
| if [ -f ${PIG_HOME}/conf/pig.properties ]; then |
| PIG_CONF_DIR=${PIG_HOME}/conf |
| fi |
| fi |
| |
| if [ -z "$PIG_CONF_DIR" ]; then |
| if [ -d /etc/pig ]; then |
| # if installed with rpm/deb package |
| PIG_CONF_DIR="/etc/pig" |
| fi |
| fi |
| |
| if [ -f "${PIG_CONF_DIR}/pig-env.sh" ]; then |
| . "${PIG_CONF_DIR}/pig-env.sh" |
| fi |
| |
# some Java parameters
if [ -z "$JAVA_HOME" ]; then
echo "Error: JAVA_HOME is not set."
exit 1
fi
| |
| JAVA=$JAVA_HOME/bin/java |
| JAVA_HEAP_MAX=-Xmx1000m |
| |
| # check envvars which might override default args |
| if [ "$PIG_HEAPSIZE" != "" ]; then |
| JAVA_HEAP_MAX="-Xmx""$PIG_HEAPSIZE""m" |
| fi |
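# e.g. PIG_HEAPSIZE=4096 makes the client JVM start with -Xmx4096m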
| |
| # CLASSPATH initially contains $PIG_CONF_DIR |
| CLASSPATH="${PIG_CONF_DIR}" |
| CLASSPATH=${CLASSPATH}:$JAVA_HOME/lib/tools.jar |
| if [ "$includeHCatalog" == "true" ]; then |
| # need to provide the hcatalog jar file path as well as |
| # the location of the hive jars on which hcatalog depends |
| hiveMetaStoreJar=hive-metastore-*.jar |
| thriftJar=libthrift-*.jar |
| hiveExecJar=hive-exec-*.jar |
| fbJar=libfb303-*.jar |
| jdoECJar=jdo*-api-*.jar |
| slfJar=slf4j-api-*.jar |
| hbaseHiveJar=hive-hbase-handler-*.jar |
| if [ "$HIVE_HOME" == "" ]; then |
| if [ -d "/usr/lib/hive" ]; then |
| HIVE_HOME=/usr/lib/hive |
| else |
| echo "Please initialize HIVE_HOME" |
exit 1
| fi |
| fi |
| |
| hiveMetaStoreVersion=`ls $HIVE_HOME/lib/$hiveMetaStoreJar` |
| thriftVersion=`ls $HIVE_HOME/lib/$thriftJar` |
| hiveExecVersion=`ls $HIVE_HOME/lib/$hiveExecJar` |
| fbJarVersion=`ls $HIVE_HOME/lib/$fbJar` |
| jdoECJarVersion=`ls $HIVE_HOME/lib/$jdoECJar` |
| slfJarVersion=`ls $HIVE_HOME/lib/$slfJar` |
| hbaseHiveVersion=`ls $HIVE_HOME/lib/$hbaseHiveJar` |
| |
| # hcatalog jar name for 0.4 and earlier |
| hcatJarOld=hcatalog-*.jar |
| # hcatalog jar name for 0.5 and newer |
| hcatJar=*hcatalog-core-*.jar |
| hbaseHCatJar=*hbase-storage-handler-*.jar |
| pigHCatJar=*hcatalog-pig-adapter-*.jar |
| if [ "$HCAT_HOME" == "" ]; then |
| if [ -d "/usr/lib/hcatalog" ]; then |
| HCAT_HOME=/usr/lib/hcatalog |
| elif [ -d "/usr/lib/hive-hcatalog" ]; then |
| HCAT_HOME=/usr/lib/hive-hcatalog |
| else |
| echo "Please initialize HCAT_HOME" |
exit 1
| fi |
| fi |
hcatJarPath=`ls $HCAT_HOME/share/hcatalog/$hcatJar 2>/dev/null`
# if the hcat jar is not found, maybe we are on hcatalog 0.4 or older
if [ -z "$hcatJarPath" ]; then
hcatJarPath=`ls $HCAT_HOME/share/hcatalog/$hcatJarOld | grep -v server`
fi
| |
# the hbase storage handler jars live in different locations depending on the hcatalog version
| if [ -d "$HCAT_HOME/share/hcatalog/storage-handlers/hbase/lib" ]; then |
| # in 0.5 and newer we need to add multiple jars to the class path |
| hbaseHCatJarPath="$HCAT_HOME/share/hcatalog/storage-handlers/hbase/lib/*" |
| else |
| hbaseHCatJarPath=`ls $HCAT_HOME/lib/$hbaseHCatJar` |
| fi |
| |
| # get the pig storage handler jar |
| pigHCatJarPath=`ls $HCAT_HOME/share/hcatalog/${pigHCatJar}` |
| |
| HCAT_CLASSPATHS=$hiveMetaStoreVersion:$thriftVersion:$hiveExecVersion:$fbJarVersion:$jdoECJarVersion:$slfJarVersion:$hbaseHiveVersion:$hcatJarPath:$hbaseHCatJarPath:$pigHCatJarPath |
| ADDITIONAL_CLASSPATHS=file://$hiveMetaStoreVersion,file://$thriftVersion,file://$hiveExecVersion,file://$fbJarVersion,file://$jdoECJarVersion,file://$slfJarVersion,file://$hbaseHiveVersion,file://$hcatJarPath,file://$hbaseHCatJarPath,file://$pigHCatJarPath |
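# HCAT_CLASSPATHS (colon-separated) goes on the local JVM classpath, while
# ADDITIONAL_CLASSPATHS lists the same jars as comma-separated file:// URIs
# for -Dpig.additional.jars.uris, so they are also registered with each job.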
| if [ "$additionalJars" != "" ]; then |
| ADDITIONAL_CLASSPATHS=$ADDITIONAL_CLASSPATHS,$additionalJars |
| fi |
| CLASSPATH=${CLASSPATH}:$HCAT_CLASSPATHS:$HIVE_HOME/conf |
| fi |
| |
| # Add user-specified CLASSPATH entries via PIG_CLASSPATH |
| # If PIG_USER_CLASSPATH_FIRST is set, prepend the entries |
| if [ "$PIG_CLASSPATH" != "" ]; then |
| if [ "$PIG_USER_CLASSPATH_FIRST" == "" ]; then |
| CLASSPATH=${CLASSPATH}:${PIG_CLASSPATH} |
| else |
| CLASSPATH=${PIG_CLASSPATH}:${CLASSPATH} |
| fi |
| fi |
| |
| # add HADOOP_CONF_DIR |
| if [ "$HADOOP_CONF_DIR" != "" ]; then |
| CLASSPATH=${CLASSPATH}:${HADOOP_CONF_DIR} |
| fi |
| |
| # so that filenames w/ spaces are handled correctly in loops below |
| IFS= |
| |
| shopt -s extglob |
| shopt -s nullglob |
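# nullglob makes unmatched globs expand to nothing, so the jar loops below are
# skipped cleanly and `echo dir/*.jar`-style probes yield an empty string.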
| |
| for f in $PIG_HOME/lib/*.jar; do |
| CLASSPATH=${CLASSPATH}:$f; |
| done |
| |
| JYTHON_JAR=`echo ${PIG_HOME}/lib/jython*.jar` |
| |
| if [ -z "$JYTHON_JAR" ]; then |
| JYTHON_JAR=`echo $PIG_HOME/build/ivy/lib/Pig/jython*.jar` |
| if [ -n "$JYTHON_JAR" ]; then |
| CLASSPATH=${CLASSPATH}:$JYTHON_JAR |
| fi |
| fi |
| |
| JRUBY_JAR=`echo ${PIG_HOME}/lib/jruby-complete-*.jar` |
| |
| if [ -z "$JRUBY_JAR" ]; then |
| JRUBY_JAR=`echo $PIG_HOME/build/ivy/lib/Pig/jruby-complete-*.jar` |
| if [ -n "$JRUBY_JAR" ]; then |
| CLASSPATH=${CLASSPATH}:$JRUBY_JAR |
| fi |
| fi |
| |
| for f in $PIG_HOME/share/pig/lib/*.jar; do |
| CLASSPATH=${CLASSPATH}:$f; |
| done |
| |
| # For Hadoop 0.23.0+ |
| # |
| #if [ -d "${PIG_HOME}/share/hadoop/common" ]; then |
| # for f in ${PIG_HOME}/share/hadoop/common/hadoop*.jar; do |
| # CLASSPATH=${CLASSPATH}:$f; |
| # done |
| #fi |
| # |
| #if [ -d "${PIG_HOME}/share/hadoop/hdfs" ]; then |
| # for f in ${PIG_HOME}/share/hadoop/hdfs/hadoop*.jar; do |
| # CLASSPATH=${CLASSPATH}:$f; |
| # done |
| #fi |
| # |
| #if [ -d "${PIG_HOME}/share/hadoop/mapreduce" ]; then |
| # for f in ${PIG_HOME}/share/hadoop/mapreduce/hadoop*.jar; do |
| # CLASSPATH=${CLASSPATH}:$f; |
| # done |
| #fi |
| |
| if which hadoop >/dev/null; then |
| HADOOP_BIN=`which hadoop` |
| fi |
| |
| if [[ -z "$HADOOP_BIN" && -n "$HADOOP_PREFIX" ]]; then |
if [ -f "$HADOOP_PREFIX/bin/hadoop" ]; then
| HADOOP_BIN=$HADOOP_PREFIX/bin/hadoop |
| fi |
| fi |
| |
| if [[ -z "$HADOOP_BIN" && -n "$HADOOP_HOME" && -d "$HADOOP_HOME" ]]; then |
if [ -f "$HADOOP_HOME/bin/hadoop" ]; then
| HADOOP_BIN=$HADOOP_HOME/bin/hadoop |
| fi |
| fi |
| |
| if [ -z "$HADOOP_BIN" ]; then |
| # if installed with rpm/deb package |
| if [ -f /usr/bin/hadoop ]; then |
| HADOOP_BIN=/usr/bin/hadoop |
| fi |
| fi |
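# By this point the search order for the hadoop launcher has been: PATH,
# $HADOOP_PREFIX/bin, $HADOOP_HOME/bin, then /usr/bin/hadoop (rpm/deb installs).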
| |
# find out HADOOP_HOME in order to find the hadoop jars;
# the presence of a hadoop-core*.jar (shipped only with Hadoop 1.x)
# tells us whether the user is running hadoop 1 or hadoop 2
| if [[ -z "$HADOOP_HOME" && -n "$HADOOP_PREFIX" ]]; then |
| HADOOP_HOME=$HADOOP_PREFIX |
| fi |
| |
| if [[ -z "$HADOOP_HOME" && -n "$HADOOP_BIN" ]]; then |
HADOOP_HOME=`dirname "$HADOOP_BIN"`/..
| fi |
| |
| HADOOP_CORE_JAR=`echo ${HADOOP_HOME}/hadoop-core*.jar` |
| |
| if [ -z "$HADOOP_CORE_JAR" ]; then |
| HADOOP_VERSION=2 |
| else |
| echo "Pig requires Hadoop 2 to be present in HADOOP_HOME (currently: $HADOOP_HOME). Please install Hadoop 2.x" |
| exit 1 |
| fi |
| |
| # if using HBase, likely want to include HBase jars and config |
| HBH=${HBASE_HOME:-"${PIG_HOME}/share/hbase"} |
| if [ -d "${HBH}" ]; then |
| for f in ${HBH}/hbase-*.jar; do |
| CLASSPATH=${CLASSPATH}:$f |
| done |
| for f in ${HBH}/lib/*.jar; do |
| CLASSPATH=${CLASSPATH}:$f |
| done |
| HBASE_CONF_DIR=${HBASE_CONF_DIR:-"${HBH}/conf"} |
| fi |
| if [ -n "$HBASE_CONF_DIR" ] && [ -d "$HBASE_CONF_DIR" ]; then |
| CLASSPATH=$HBASE_CONF_DIR:$CLASSPATH |
| fi |
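# The HBase conf dir is prepended so that the hbase-site.xml found there takes
# precedence over any copy bundled inside a jar on the classpath.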
| |
| if [ -d "${PIG_HOME}/etc/hadoop" ]; then |
| CLASSPATH=${CLASSPATH}:${PIG_HOME}/etc/hadoop; |
| fi |
| |
| # locate ZooKeeper |
| ZKH=${ZOOKEEPER_HOME:-"${PIG_HOME}/share/zookeeper"} |
| if [ -d "$ZKH" ] ; then |
| for f in ${ZKH}/zookeeper-*.jar; do |
| CLASSPATH=${CLASSPATH}:$f |
| done |
| fi |
| |
| # default log directory & file |
| if [ "$PIG_LOG_DIR" = "" ]; then |
| PIG_LOG_DIR="$PIG_HOME/logs" |
| fi |
| if [ "$PIG_LOGFILE" = "" ]; then |
| PIG_LOGFILE='pig.log' |
| fi |
| |
| # cygwin path translation |
| if $cygwin; then |
| CLASSPATH=`cygpath -p -w "$CLASSPATH"` |
| PIG_HOME=`cygpath -d "$PIG_HOME"` |
| PIG_LOG_DIR=`cygpath -d "$PIG_LOG_DIR"` |
| fi |
| |
| # restore ordinary behaviour |
| unset IFS |
| |
| PIG_OPTS="$PIG_OPTS -Dpig.log.dir=$PIG_LOG_DIR" |
| PIG_OPTS="$PIG_OPTS -Dpig.log.file=$PIG_LOGFILE" |
| PIG_OPTS="$PIG_OPTS -Dpig.home.dir=$PIG_HOME" |
| if [ "$includeHCatalog" == "true" ]; then |
| addJars=`echo $PIG_OPTS | awk '{ for (i=1; i<=NF; i++) print $i; }' | grep "\-Dpig.additional.jars.uris=" | sed s/-Dpig.additional.jars.uris=//` |
| if [ "$addJars" != "" ]; then |
| ADDITIONAL_CLASSPATHS=$addJars,$ADDITIONAL_CLASSPATHS |
| PIG_OPTS=`echo $PIG_OPTS | sed 's/-Dpig.additional.jars.uris=[^ ]*//'` |
| fi |
| PIG_OPTS="$PIG_OPTS -Dpig.additional.jars.uris=$ADDITIONAL_CLASSPATHS" |
| fi |
| |
| ################# ADDING SPARK DEPENDENCIES ################## |
| # For spark_local mode: |
| if [ "$isSparkLocalMode" == "true" ]; then |
| #SPARK_MASTER is forced to be "local" in spark_local mode |
| SPARK_MASTER="local" |
| for f in $PIG_HOME/lib/spark/*.jar; do |
| CLASSPATH=${CLASSPATH}:$f; |
| done |
| fi |
| |
# For spark mode:
# SPARK_HOME must be set so that we can locate $SPARK_HOME/lib/spark-assembly*.jar
# and add it to the classpath.
| if [ "$isSparkMode" == "true" ]; then |
| if [ -z "$SPARK_HOME" ]; then |
| echo "Error: SPARK_HOME is not set!" |
| exit 1 |
| fi |
| |
# SPARK_JAR must point to the hdfs location of spark-assembly*.jar; this lets
# YARN cache the assembly on its nodes so it does not have to be distributed
# each time an application runs.
| if [ -z "$SPARK_JAR" ]; then |
| echo "Error: SPARK_JAR is not set, SPARK_JAR stands for the hdfs location of spark-assembly*.jar. This allows YARN to cache spark-assembly*.jar on nodes so that it doesn't need to be distributed each time an application runs." |
| exit 1 |
| fi |
| |
| if [ -n "$SPARK_HOME" ]; then |
| echo "Using Spark Home: " ${SPARK_HOME} |
| SPARK_ASSEMBLY_JAR=`ls ${SPARK_HOME}/lib/spark-assembly*` |
| CLASSPATH=${CLASSPATH}:$SPARK_ASSEMBLY_JAR |
| fi |
| fi |
| |
# spark-assembly.jar bundles jcl-over-slf4j, whose LogFactory implementation is
# incompatible with plain commons-logging, so pin the standard LogFactoryImpl explicitly
| if [ "$isSparkMode" == "true" ]; then |
| PIG_OPTS="$PIG_OPTS -Dorg.apache.commons.logging.LogFactory=org.apache.commons.logging.impl.LogFactoryImpl" |
| fi |
################# END SPARK DEPENDENCIES #####################
| |
| # run it |
| if [ -n "$HADOOP_BIN" ]; then |
| if [ "$debug" == "true" ]; then |
| echo "Find hadoop at $HADOOP_BIN" |
| fi |
| |
| PIG_JAR=`echo $PIG_HOME/pig*-core-h${HADOOP_VERSION}.jar` |
| |
| # for deb/rpm package, add pig jar in /usr/share/pig |
| if [ -z "$PIG_JAR" ]; then |
| PIG_JAR=`echo $PIG_HOME/share/pig/pig*-core-h${HADOOP_VERSION}.jar` |
| fi |
| |
| if [ -n "$PIG_JAR" ]; then |
| CLASSPATH=${CLASSPATH}:$PIG_JAR |
| else |
| echo "Cannot locate pig-core-h${HADOOP_VERSION}.jar. do 'ant jar', and try again" |
| exit 1 |
| fi |
| |
| for f in $PIG_HOME/lib/h${HADOOP_VERSION}/*.jar; do |
| CLASSPATH=${CLASSPATH}:$f; |
| done |
| |
| export HADOOP_CLASSPATH=$CLASSPATH:$HADOOP_CLASSPATH |
| export HADOOP_CLIENT_OPTS="$JAVA_HEAP_MAX $PIG_OPTS $HADOOP_CLIENT_OPTS" |
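# bin/hadoop reads HADOOP_CLASSPATH and HADOOP_CLIENT_OPTS, which is how the
# assembled classpath, heap setting and PIG_OPTS reach the client JVM.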
| if [ "$debug" == "true" ]; then |
| echo "dry run:" |
| echo "HADOOP_CLASSPATH: $HADOOP_CLASSPATH" |
| echo "HADOOP_OPTS: $HADOOP_OPTS" |
| echo "HADOOP_CLIENT_OPTS: $HADOOP_CLIENT_OPTS" |
| echo "$HADOOP_BIN" jar "$PIG_JAR" "${remaining[@]}" |
| echo |
| else |
| exec "$HADOOP_BIN" jar "$PIG_JAR" "${remaining[@]}" |
| fi |
| else |
| # use bundled hadoop to run local mode |
| PIG_JAR=`echo $PIG_HOME/pig*-core-h2.jar` |
| |
| if [ -n "$PIG_JAR" ]; then |
| CLASSPATH="${CLASSPATH}:$PIG_JAR" |
| else |
| echo "Cannot locate pig.jar. do 'ant jar', and try again" |
| exit 1 |
| fi |
| |
| for f in $PIG_HOME/lib/h2/*.jar; do |
| CLASSPATH=${CLASSPATH}:$f; |
| done |
| |
| # Add bundled hadoop jars |
| for f in $PIG_HOME/lib/hadoop2-runtime/*.jar; do |
| CLASSPATH=${CLASSPATH}:$f; |
| done |
| |
| if [ "$debug" == "true" ]; then |
| echo "Cannot find local hadoop installation, using bundled `java -cp $CLASSPATH org.apache.hadoop.util.VersionInfo | head -1`" |
| fi |
| |
| CLASS=org.apache.pig.Main |
| if [ "$debug" == "true" ]; then |
| echo "dry run:" |
| echo "$JAVA" $JAVA_HEAP_MAX $PIG_OPTS -classpath "$CLASSPATH" $CLASS "${remaining[@]}" |
| echo |
| else |
| exec "$JAVA" $JAVA_HEAP_MAX $PIG_OPTS -classpath "$CLASSPATH" $CLASS "${remaining[@]}" |
| fi |
| fi |
| shopt -u nullglob |
| shopt -u extglob |