blob: 5bc6748f3cdd4a427d442060ffc8e312224019ec [file] [log] [blame]
#! /usr/bin/env bash
# Copyright 2015 Webindex authors (see AUTHORS)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# --- Environment bootstrap ---------------------------------------------------
# Resolves the script's own directory (BIN_DIR) and the WebIndex home above it
# (WI_HOME, exported), loads the environment file, verifies the Hadoop and Fluo
# directories, creates the logs directory and selects the data config file
# (DATA_CONFIG, exported). Exits with status 1 when a requirement is not met.

BIN_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
WI_HOME=$(cd "$(dirname "$BIN_DIR")" && pwd)
export WI_HOME

# Prefer the user's webindex-env.sh; otherwise fall back to the example file.
if [[ -f "$WI_HOME/conf/webindex-env.sh" ]]; then
  . "$WI_HOME/conf/webindex-env.sh"
else
  . "$WI_HOME/conf/webindex-env.sh.example"
fi

# The expansions below are quoted on purpose: unquoted, a variable that is set
# but empty collapses the test to `[ ! -d ]`, which evaluates to false and
# silently skips the existence check. Quoting also keeps paths containing
# whitespace intact.
: "${HADOOP_CONF_DIR?HADOOP_CONF_DIR must be set in bash env or conf/webindex-env.sh}"
if [[ ! -d "$HADOOP_CONF_DIR" ]]; then
  echo "HADOOP_CONF_DIR=$HADOOP_CONF_DIR does not exist"
  exit 1
fi

: "${FLUO_HOME?FLUO_HOME must be set in bash env or conf/webindex-env.sh}"
if [[ ! -d "$FLUO_HOME" ]]; then
  echo "FLUO_HOME=$FLUO_HOME does not exist"
  exit 1
fi

mkdir -p "$WI_HOME/logs"

# Use conf/data.yml when present, otherwise the shipped data.yml.example.
export DATA_CONFIG="$WI_HOME/conf/data.yml"
if [[ ! -f "$DATA_CONFIG" ]]; then
  export DATA_CONFIG="$WI_HOME/conf/data.yml.example"
  if [[ ! -f "$DATA_CONFIG" ]]; then
    echo "Could not find data.yml or data.yml.example in $WI_HOME/conf"
    exit 1
  fi
  echo "Using default config at $DATA_CONFIG"
fi
#######################################
# Prints the value of a property from the data config file.
# Globals:   DATA_CONFIG (read) - path of the config file to search
# Arguments: $1 - grep pattern selecting the line(s), normally a key name
# Outputs:   the second space-separated field of every matching line on
#            stdout (an empty line when nothing matches)
#######################################
get_prop() {
  local value
  # Both operands are quoted so that a config path containing whitespace
  # reaches grep as one argument; `--` stops a pattern that begins with '-'
  # from being parsed as a grep option.
  value=$(grep -- "$1" "$DATA_CONFIG" | cut -d ' ' -f 2)
  printf '%s\n' "$value"
}
# Executor sizing must be provided by the environment or conf/webindex-env.sh;
# the script aborts with the message below when either variable is unset.
: "${WI_EXECUTOR_INSTANCES?WI_EXECUTOR_INSTANCES must be set in bash env or conf/webindex-env.sh}"
: "${WI_EXECUTOR_MEMORY?WI_EXECUTOR_MEMORY must be set in bash env or conf/webindex-env.sh}"

# Spark options shared by the jobs submitted from this script.
COMMON_SPARK_OPTS="--master yarn-client"
COMMON_SPARK_OPTS+=" --num-executors ${WI_EXECUTOR_INSTANCES}"
COMMON_SPARK_OPTS+=" --executor-memory ${WI_EXECUTOR_MEMORY}"
export COMMON_SPARK_OPTS

# Log file for this invocation: <command>_<epoch seconds>.log under logs/.
COMMAND_LOGFILE="${WI_HOME}/logs/${1}_$(date +%s).log"
# --- Command dispatch --------------------------------------------------------
# Selects the sub-command named by $1. Most commands assemble a command line in
# COMMAND and hand it to wi_run_command, which runs it detached (logging to
# COMMAND_LOGFILE) unless -fg was requested. Unknown commands print the help
# text and exit with status 1.

#######################################
# Runs the command line held in COMMAND.
# Globals:   COMMAND (read), COMMAND_LOGFILE (read)
# Arguments: $1 - "-fg" to run in the foreground; any other value runs the
#                 command under nohup in the background
#            $2 - label used in the "Started ..." message
# Outputs:   in background mode, a message naming the log file
# Returns:   the command's exit status in foreground mode
#######################################
wi_run_command() {
  local mode=$1 label=$2
  if [[ "$mode" == "-fg" ]]; then
    # COMMAND is a flat string; word splitting here is intentional.
    # shellcheck disable=SC2086
    ${COMMAND}
  else
    # shellcheck disable=SC2086
    nohup ${COMMAND} &> "$COMMAND_LOGFILE" &
    echo "Started $label. Logs are being output to $COMMAND_LOGFILE"
  fi
}

#######################################
# Sets FLUO_PROPS to the fluo.properties file of the configured Fluo
# application and exits with status 1 when that file does not exist.
# Globals:   FLUO_HOME (read), FLUO_PROPS (written)
#######################################
wi_require_fluo_props() {
  FLUO_PROPS="$FLUO_HOME/apps/$(get_prop fluoApp)/conf/fluo.properties"
  if [[ ! -f "$FLUO_PROPS" ]]; then
    echo "Fluo properties file must exist at $FLUO_PROPS"
    exit 1
  fi
}

case "$1" in
  getpaths)
    if (( $# < 2 )); then
      echo "Usage: webindex getpaths <DATE>"
      exit 1
    fi
    PATHS_DIR="$WI_HOME/paths"
    mkdir -p "$PATHS_DIR"
    PATHS_FILE="$2.wat.paths"
    if [[ -f "$PATHS_DIR/$PATHS_FILE" ]]; then
      echo "Crawl paths file already exists at $PATHS_DIR/$PATHS_FILE"
    else
      # Remove any stale partial download before fetching again.
      rm -f "$PATHS_DIR/wat.paths.gz"
      PATHS_URL="https://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-data/CC-MAIN-$2/wat.paths.gz"
      # Probe the URL first so a missing crawl date gives a clear message.
      if wget -S --spider "$PATHS_URL" 2>&1 | grep -q 'HTTP/1.1 200 OK'; then
        # Only report success when download, decompress and rename all worked.
        if wget -P "$PATHS_DIR" "$PATHS_URL" \
          && gzip -d "$PATHS_DIR/wat.paths.gz" \
          && mv "$PATHS_DIR/wat.paths" "$PATHS_DIR/$PATHS_FILE"; then
          echo "Downloaded paths file to $PATHS_DIR/$PATHS_FILE"
        else
          echo "Failed to download paths file from $PATHS_URL"
          exit 1
        fi
      else
        echo "Crawl paths file for date $2 does not exist at $PATHS_URL"
        exit 1
      fi
    fi
    ;;
  copy)
    if (( $# < 4 || $# > 5 )); then
      echo "Usage: webindex copy <DATE> <RANGE> <DEST> [-fg]"
      exit 1
    fi
    . "$BIN_DIR/impl/base.sh"
    COMMAND="$SPARK_SUBMIT --class webindex.data.Copy $COMMON_SPARK_OPTS"
    COMMAND+=" $WI_DATA_DEP_JAR $WI_HOME/paths/$2.wat.paths $3 $4"
    wi_run_command "$5" "copy"
    ;;
  init)
    # <SRC> is optional, so anything from zero to two extra arguments is valid.
    if (( $# > 3 )); then
      echo "Usage: webindex init [<SRC>] [-fg]"
      exit 1
    fi
    . "$BIN_DIR/impl/base.sh"
    if [[ "$2" == "-fg" ]]; then
      # No <SRC> given: -fg sits in the second position.
      COMMAND="$BIN_DIR/impl/init.sh"
      wi_run_command -fg "init"
    else
      COMMAND="$BIN_DIR/impl/init.sh $2"
      wi_run_command "$3" "init"
    fi
    ;;
  load-hdfs)
    if (( $# < 2 || $# > 3 )); then
      echo "Usage: webindex load-hdfs <SRC> [-fg]"
      exit 1
    fi
    . "$BIN_DIR/impl/base.sh"
    wi_require_fluo_props
    COMMAND="$SPARK_SUBMIT --class webindex.data.LoadHdfs $COMMON_SPARK_OPTS"
    COMMAND+=" --files $FLUO_PROPS $WI_DATA_DEP_JAR $2"
    wi_run_command "$3" "load-hdfs"
    ;;
  load-s3)
    if (( $# < 3 || $# > 4 )); then
      echo "Usage: webindex load-s3 <DATE> <RANGE> [-fg]"
      exit 1
    fi
    . "$BIN_DIR/impl/base.sh"
    wi_require_fluo_props
    COMMAND="$SPARK_SUBMIT --class webindex.data.LoadS3 $COMMON_SPARK_OPTS"
    COMMAND+=" --files $FLUO_PROPS $WI_DATA_DEP_JAR $WI_HOME/paths/$2.wat.paths $3"
    wi_run_command "$4" "load-s3"
    ;;
  test-parser)
    if (( $# < 3 || $# > 4 )); then
      echo "Usage: webindex test-parser <DATE> <RANGE> [-fg]"
      exit 1
    fi
    . "$BIN_DIR/impl/base.sh"
    COMMAND="$SPARK_SUBMIT --class webindex.data.TestParser $COMMON_SPARK_OPTS"
    COMMAND+=" $WI_DATA_DEP_JAR $WI_HOME/paths/$2.wat.paths $3"
    wi_run_command "$4" "test-parser"
    ;;
  ui)
    # Stop any UI instance that is already running before starting a new one.
    pkill -9 -f webindex-ui
    WI_UI_JAR="$WI_HOME/modules/ui/target/webindex-ui-0.0.1-SNAPSHOT.jar"
    if [[ ! -f "$WI_UI_JAR" ]]; then
      # Never run `mvn clean` from an unintended directory.
      cd "$WI_HOME/modules/ui" || exit 1
      mvn clean install -DskipTests
    fi
    DROPWIZARD_CONFIG=""
    if [[ -f "$WI_HOME/conf/dropwizard.yml" ]]; then
      DROPWIZARD_CONFIG="$WI_HOME/conf/dropwizard.yml"
      echo "Running with dropwizard config at $DROPWIZARD_CONFIG"
    fi
    COMMAND="java -jar $WI_UI_JAR server $DROPWIZARD_CONFIG"
    wi_run_command "$2" "UI"
    ;;
  splits)
    # <SRC> is $2, so the optional -fg flag is $3.
    if (( $# < 2 || $# > 3 )) || [[ "$2" == "-fg" ]]; then
      echo "Usage: webindex splits <SRC> [-fg]"
      exit 1
    fi
    . "$BIN_DIR/impl/base.sh"
    COMMAND="$SPARK_SUBMIT --class webindex.data.CalcSplits $COMMON_SPARK_OPTS"
    COMMAND+=" --conf spark.shuffle.service.enabled=true $WI_DATA_DEP_JAR $2"
    wi_run_command "$3" "splits calculation"
    ;;
  test)
    # All arguments after the command name are forwarded to test.sh.
    COMMAND="$BIN_DIR/impl/test.sh ${*:2}"
    if [[ "$*" == *"-fg"* ]]; then
      wi_run_command -fg "webindex test ${*:2}"
    else
      wi_run_command "" "webindex test ${*:2}"
    fi
    ;;
  test-id)
    if (( $# > 3 )); then
      echo "Usage: webindex test-id <ID> [-fg]"
      exit 1
    fi
    if [[ -z "$2" ]]; then
      echo "Available tests:"
      echo "----------------"
      cat "$WI_HOME/conf/webindex-tests.txt"
      exit 1
    fi
    # Look up the test arguments: the text after '=' on the line matching <ID>.
    TEST_ARGS=$(grep -- "$2" "$WI_HOME/conf/webindex-tests.txt" | cut -d = -f 2)
    if [[ -z "$TEST_ARGS" ]]; then
      echo "Unknown test ID: $2"
      echo "Available tests:"
      echo "----------------"
      cat "$WI_HOME/conf/webindex-tests.txt"
      exit 1
    fi
    COMMAND="$BIN_DIR/impl/test.sh $TEST_ARGS"
    if [[ "$*" == *"-fg"* ]]; then
      wi_run_command -fg "webindex test-id $2"
    else
      # Background runs get a log file that carries the test ID.
      COMMAND_LOGFILE="$WI_HOME/logs/test_${2}_$(date +%s).log"
      wi_run_command "" "webindex test-id $2"
    fi
    ;;
  kill)
    : "${HADOOP_PREFIX?HADOOP_PREFIX must be set}"
    FLUO_APP=$(get_prop fluoApp)
    FLUO_CMD="$FLUO_HOME/bin/fluo"
    if [[ ! -f "$FLUO_CMD" ]]; then
      echo "Fluo command script does not exist at $FLUO_CMD"
      exit 1
    fi
    echo "Killing the webindex UI web server..."
    pkill -9 -f webindex-ui
    echo "Stopping the $FLUO_APP Fluo application (if running)..."
    "$FLUO_CMD" stop "$FLUO_APP"
    echo "Killing any webindex Spark jobs running in YARN..."
    YARN="$HADOOP_PREFIX/bin/yarn"
    # The first whitespace-separated field of each listed line is taken as the
    # application id. The same $YARN binary is used for listing and killing so
    # the command does not depend on `yarn` being on PATH.
    "$YARN" application -list | grep webindex | while read -r app_id _; do
      "$YARN" application -kill "$app_id"
    done
    ;;
  *)
    echo -e "Usage: webindex <command> (<argument>)\n"
    echo -e "Possible commands:\n"
    echo " getpaths <DATE> Retrieves paths file for given crawl <DATE> (i.e 2015-18) and stores file in the 'paths/' directory"
    echo " See https://commoncrawl.org/the-data/get-started/ for possible crawl dates"
    echo " copy <DATE> <RANGE> <DEST> Copies CommonCrawl data files from S3 given a <DATE> and <RANGE> (i.e 0-8) into HDFS <DEST> directory"
    echo " init [<SRC>] Initializes and starts the WebIndex application. Optionally, a <SRC> HDFS directory can be added to"
    echo " to the command to initialize Fluo's table in Accumulo with data before starting the application"
    echo " load-hdfs <SRC> Loads data from the HDFS <SRC> directory into Fluo"
    echo " load-s3 <DATE> <RANGE> Loads data from S3 into Fluo. Data is selected using a paths file <DATE> and file <RANGE> (i.e 5-7)"
    echo " ui Starts the webindex UI"
    echo " splits <SRC> Calculate splits using data in HDFS <SRC> directory"
    echo " kill Kills the webindex Fluo application and any webindex Spark jobs (if running)"
    echo " test-id <ID> Starts a pre-configured webindex test indentified by <ID>. Run without arguments for a list of tests."
    echo " test <args> Starts a webindex test. Each test will first remove any previously running test. It will"
    echo " then initialize the webindex application, start the UI, and load data."
    echo " Tests can be configured by the following arguments."
    echo " Required args:"
    echo " -d <DATE> Date of common crawl paths file (required)"
    echo " -i <RANGE> Init data range (i.e START-END). Set to 'none' to not initialize with any data."
    echo " -l <RANGE> Load data range (i.e START-END)"
    echo " Optional args:"
    echo " -s <SRC> Source (i.e. hdfs, s3) to use when loading data. If not set, defaults to 's3'"
    echo " -e <NUM> Number of Spark executors to run in all Spark jobs"
    echo " -m <MEM> Amount of memory (i.e 512m, 1g) to provide to each Spark executor"
    echo " test-parser <DATE> <RANGE> Tests parser on data loaded from S3. Data is selected using a paths file <DATE> and file <RANGE> (i.e 5-7)"
    echo " "
    echo "NOTE: All commands except getpaths will run in background and output to a log by default. Add -fg to end of these commands"
    echo "to run them in the foreground."
    echo " "
    exit 1
    ;;
esac