| #! /usr/bin/env bash |
| |
| # Copyright 2015 Webindex authors (see AUTHORS) |
| # |
| # Licensed under the Apache License, Version 2.0 (the "License"); |
| # you may not use this file except in compliance with the License. |
| # You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
# Resolve the bin/ directory and the webindex home directory from the
# location of this script, so the command works from any CWD.
BIN_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
export WI_HOME=$( cd "$( dirname "$BIN_DIR" )" && pwd )

# Load user configuration if present; otherwise fall back to the shipped example.
if [ -f "$WI_HOME/conf/webindex-env.sh" ]; then
  . "$WI_HOME/conf/webindex-env.sh"
else
  . "$WI_HOME/conf/webindex-env.sh.example"
fi

# Fail fast with a clear message if required env vars are unset.
: ${HADOOP_CONF_DIR?"HADOOP_CONF_DIR must be set in bash env or conf/webindex-env.sh"}
if [ ! -d "$HADOOP_CONF_DIR" ]; then
  echo "HADOOP_CONF_DIR=$HADOOP_CONF_DIR does not exist"
  exit 1
fi
: ${FLUO_HOME?"FLUO_HOME must be set in bash env or conf/webindex-env.sh"}
if [ ! -d "$FLUO_HOME" ]; then
  echo "FLUO_HOME=$FLUO_HOME does not exist"
  exit 1
fi

mkdir -p "$WI_HOME/logs"

# Prefer the user's data.yml; fall back to the example config, else abort.
export DATA_CONFIG=$WI_HOME/conf/data.yml
if [ ! -f "$DATA_CONFIG" ]; then
  export DATA_CONFIG=$WI_HOME/conf/data.yml.example
  if [ ! -f "$DATA_CONFIG" ]; then
    echo "Could not find data.yml or data.yml.example in $WI_HOME/conf"
    exit 1
  fi
  echo "Using default config at $DATA_CONFIG"
fi
| |
# get_prop <key> - print the value of <key> from the data config.
# Config lines look like "key: value"; the key is matched as a grep
# pattern (substring), so keys are assumed unique within the file.
function get_prop {
  grep -- "$1" "$DATA_CONFIG" | cut -d ' ' -f 2
}
| |
# Spark sizing must be provided by the environment; abort otherwise.
: ${WI_EXECUTOR_INSTANCES?"WI_EXECUTOR_INSTANCES must be set in bash env or conf/webindex-env.sh"}
: ${WI_EXECUTOR_MEMORY?"WI_EXECUTOR_MEMORY must be set in bash env or conf/webindex-env.sh"}
# Options shared by every spark-submit invocation below.
export COMMON_SPARK_OPTS="--master yarn-client --num-executors $WI_EXECUTOR_INSTANCES --executor-memory $WI_EXECUTOR_MEMORY"
# Per-invocation log file, named after the subcommand and a unix timestamp.
COMMAND_LOGFILE=$WI_HOME/logs/$1_$(date +%s).log
| |
| case "$1" in |
| getpaths) |
| PATHS_DIR=$WI_HOME/paths |
| mkdir -p $PATHS_DIR |
| PATHS_FILE="$2".wat.paths |
| if [ ! -f $PATHS_DIR/$PATHS_FILE ]; then |
| rm -f $PATHS_DIR/wat.paths.gz |
| PATHS_URL=https://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-data/CC-MAIN-$2/wat.paths.gz |
| if [[ `wget -S --spider $PATHS_URL 2>&1 | grep 'HTTP/1.1 200 OK'` ]]; then |
| wget -P $PATHS_DIR $PATHS_URL |
| gzip -d $PATHS_DIR/wat.paths.gz |
| mv $PATHS_DIR/wat.paths $PATHS_DIR/$PATHS_FILE |
| echo "Downloaded paths file to $PATHS_DIR/$PATHS_FILE" |
| else |
| echo "Crawl paths file for date $2 does not exist at $PATHS_URL" |
| exit 1 |
| fi |
| else |
| echo "Crawl paths file already exists at $PATHS_DIR/$PATHS_FILE" |
| fi |
| ;; |
| copy) |
| if [ "$#" -lt 4 -o "$#" -gt 5 ]; then |
| echo "Usage: webindex copy <DATE> <RANGE> <DEST> [-fg]" |
| exit 1 |
| fi |
| . $BIN_DIR/impl/base.sh |
| COMMAND="$SPARK_SUBMIT --class webindex.data.Copy $COMMON_SPARK_OPTS \ |
| $WI_DATA_DEP_JAR $WI_HOME/paths/"$2".wat.paths $3 $4" |
| if [ "$5" != "-fg" ]; then |
| nohup ${COMMAND} &> $COMMAND_LOGFILE & |
| echo "Started copy. Logs are being output to $COMMAND_LOGFILE" |
| else |
| ${COMMAND} |
| fi |
| ;; |
| init) |
| if [ "$#" -lt 1 -o "$#" -gt 3 ]; then |
| echo "Usage: webindex init <SRC> [-fg]" |
| exit 1 |
| fi |
| . $BIN_DIR/impl/base.sh |
| COMMAND="$BIN_DIR/impl/init.sh $2" |
| if [ "$2" == "-fg" ]; then |
| COMMAND="$BIN_DIR/impl/init.sh" |
| fi |
| if [ "$2" != "-fg" -a "$3" != "-fg" ]; then |
| nohup ${COMMAND} &> $COMMAND_LOGFILE & |
| echo "Started init. Logs are being output to $COMMAND_LOGFILE" |
| else |
| ${COMMAND} |
| fi |
| ;; |
| load-hdfs) |
| if [ "$#" -lt 2 -o "$#" -gt 3 ]; then |
| echo "Usage: webindex load-hdfs <SRC> [-fg]" |
| exit 1 |
| fi |
| . $BIN_DIR/impl/base.sh |
| FLUO_PROPS=$FLUO_HOME/apps/`get_prop fluoApp`/conf/fluo.properties |
| if [ ! -f $FLUO_PROPS ]; then |
| echo "Fluo properties file must exist at $FLUO_PROPS" |
| exit 1 |
| fi |
| COMMAND="$SPARK_SUBMIT --class webindex.data.LoadHdfs $COMMON_SPARK_OPTS \ |
| --files $FLUO_PROPS $WI_DATA_DEP_JAR $2" |
| if [ "$3" != "-fg" ]; then |
| nohup ${COMMAND} &> $COMMAND_LOGFILE & |
| echo "Started load-hdfs. Logs are being output to $COMMAND_LOGFILE" |
| else |
| ${COMMAND} |
| fi |
| ;; |
| load-s3) |
| if [ "$#" -lt 3 -o "$#" -gt 4 ]; then |
| echo "Usage: webindex load-s3 <DATE> <RANGE> [-fg]" |
| exit 1 |
| fi |
| . $BIN_DIR/impl/base.sh |
| FLUO_PROPS=$FLUO_HOME/apps/`get_prop fluoApp`/conf/fluo.properties |
| if [ ! -f $FLUO_PROPS ]; then |
| echo "Fluo properties file must exist at $FLUO_PROPS" |
| exit 1 |
| fi |
| COMMAND="$SPARK_SUBMIT --class webindex.data.LoadS3 $COMMON_SPARK_OPTS \ |
| --files $FLUO_PROPS $WI_DATA_DEP_JAR $WI_HOME/paths/"$2".wat.paths $3" |
| if [ "$4" != "-fg" ]; then |
| nohup ${COMMAND} &> $COMMAND_LOGFILE & |
| echo "Started load-s3. Logs are being output to $COMMAND_LOGFILE" |
| else |
| ${COMMAND} |
| fi |
| ;; |
| test-parser) |
| if [ "$#" -lt 3 -o "$#" -gt 4 ]; then |
| echo "Usage: webindex test-parser <DATE> <RANGE> [-fg]" |
| exit 1 |
| fi |
| . $BIN_DIR/impl/base.sh |
| COMMAND="$SPARK_SUBMIT --class webindex.data.TestParser $COMMON_SPARK_OPTS \ |
| $WI_DATA_DEP_JAR $WI_HOME/paths/"$2".wat.paths $3" |
| if [ "$4" != "-fg" ]; then |
| nohup ${COMMAND} &> $COMMAND_LOGFILE & |
| echo "Started data-verify. Logs are being output to $COMMAND_LOGFILE" |
| else |
| ${COMMAND} |
| fi |
| ;; |
| ui) |
| pkill -9 -f webindex-ui |
| WI_UI_JAR=$WI_HOME/modules/ui/target/webindex-ui-0.0.1-SNAPSHOT.jar |
| if [ ! -f $WI_UI_JAR ]; then |
| cd $WI_HOME/modules/ui |
| mvn clean install -DskipTests |
| fi |
| DROPWIZARD_CONFIG="" |
| if [ -f $WI_HOME/conf/dropwizard.yml ]; then |
| DROPWIZARD_CONFIG=$WI_HOME/conf/dropwizard.yml |
| echo "Running with dropwizard config at $DROPWIZARD_CONFIG" |
| fi |
| COMMAND="java -jar $WI_UI_JAR server $DROPWIZARD_CONFIG" |
| if [ "$2" != "-fg" ]; then |
| nohup ${COMMAND} &> $COMMAND_LOGFILE & |
| echo "Started UI. Logs are being output to $COMMAND_LOGFILE" |
| else |
| ${COMMAND} |
| fi |
| ;; |
| splits) |
| . $BIN_DIR/impl/base.sh |
| COMMAND="$SPARK_SUBMIT --class webindex.data.CalcSplits \ |
| $COMMON_SPARK_OPTS \ |
| --conf spark.shuffle.service.enabled=true \ |
| $WI_DATA_DEP_JAR $2" |
| if [ "$2" != "-fg" ]; then |
| nohup ${COMMAND} &> $COMMAND_LOGFILE & |
| echo "Started splits calculation. Logs are being output to $COMMAND_LOGFILE" |
| else |
| ${COMMAND} |
| fi |
| ;; |
| test) |
| COMMAND="$BIN_DIR/impl/test.sh ${@:2}" |
| if [[ $@ != *"-fg"* ]]; then |
| nohup ${COMMAND} &> $COMMAND_LOGFILE & |
| echo "Started webindex test ${@:2}. Logs are being output to $COMMAND_LOGFILE" |
| else |
| ${COMMAND} |
| fi |
| ;; |
| test-id) |
| if [ "$#" -gt 3 ]; then |
| echo "Usage: webindex test-id <ID> [-fg]" |
| exit 1 |
| fi |
| if [ -z "$2" ]; then |
| echo "Available tests:" |
| echo "----------------" |
| cat $WI_HOME/conf/webindex-tests.txt |
| exit 1 |
| fi |
| TEST_ARGS="`grep $2 $WI_HOME/conf/webindex-tests.txt | cut -d = -f 2`" |
| if [ -z "$TEST_ARGS" ]; then |
| echo "Unknown test ID: $2" |
| echo "Available tests:" |
| echo "----------------" |
| cat $WI_HOME/conf/webindex-tests.txt |
| exit 1 |
| fi |
| COMMAND="$BIN_DIR/impl/test.sh $TEST_ARGS" |
| if [[ $@ != *"-fg"* ]]; then |
| COMMAND_LOGFILE=$WI_HOME/logs/test_"$2"_`date +%s`.log |
| nohup ${COMMAND} &> $COMMAND_LOGFILE & |
| echo "Started webindex run-test $2. Logs are being output to $COMMAND_LOGFILE" |
| else |
| ${COMMAND} |
| fi |
| ;; |
| kill) |
| : ${HADOOP_PREFIX?"HADOOP_PREFIX must be set"} |
| FLUO_APP=`get_prop fluoApp` |
| FLUO_CMD=$FLUO_HOME/bin/fluo |
| if [ ! -f $FLUO_CMD ]; then |
| echo "Fluo command script does not exist at $FLUO_CMD" |
| exit 1 |
| fi |
| echo "Killing the webindex UI web server..." |
| pkill -9 -f webindex-ui |
| |
| echo "Stopping the $FLUO_APP Fluo application (if running)..." |
| $FLUO_CMD stop $FLUO_APP |
| |
| echo "Killing any webindex Spark jobs running in YARN..." |
| YARN=$HADOOP_PREFIX/bin/yarn |
| $YARN application -list | grep webindex | while read x; do yarn application -kill `echo $x | cut -d ' ' -f 1` ; done |
| ;; |
| *) |
| echo -e "Usage: webindex <command> (<argument>)\n" |
| echo -e "Possible commands:\n" |
| echo " getpaths <DATE> Retrieves paths file for given crawl <DATE> (i.e 2015-18) and stores file in the 'paths/' directory" |
| echo " See https://commoncrawl.org/the-data/get-started/ for possible crawl dates" |
| echo " copy <DATE> <RANGE> <DEST> Copies CommonCrawl data files from S3 given a <DATE> and <RANGE> (i.e 0-8) into HDFS <DEST> directory" |
| echo " init [<SRC>] Initializes and starts the WebIndex application. Optionally, a <SRC> HDFS directory can be added to" |
| echo " to the command to initialize Fluo's table in Accumulo with data before starting the application" |
| echo " load-hdfs <SRC> Loads data from the HDFS <SRC> directory into Fluo" |
| echo " load-s3 <DATE> <RANGE> Loads data from S3 into Fluo. Data is selected using a paths file <DATE> and file <RANGE> (i.e 5-7)" |
| echo " ui Starts the webindex UI" |
| echo " splits <SRC> Calculate splits using data in HDFS <SRC> directory" |
| echo " kill Kills the webindex Fluo application and any webindex Spark jobs (if running)" |
| echo " test-id <ID> Starts a pre-configured webindex test indentified by <ID>. Run without arguments for a list of tests." |
| echo " test <args> Starts a webindex test. Each test will first remove any previously running test. It will" |
| echo " then initialize the webindex application, start the UI, and load data." |
| echo " Tests can be configured by the following arguments." |
| echo " Required args:" |
| echo " -d <DATE> Date of common crawl paths file (required)" |
| echo " -i <RANGE> Init data range (i.e START-END). Set to 'none' to not initialize with any data." |
| echo " -l <RANGE> Load data range (i.e START-END)" |
| echo " Optional args:" |
| echo " -s <SRC> Source (i.e. hdfs, s3) to use when loading data. If not set, defaults to 's3'" |
| echo " -e <NUM> Number of Spark executors to run in all Spark jobs" |
| echo " -m <MEM> Amount of memory (i.e 512m, 1g) to provide to each Spark executor" |
| echo " test-parser <DATE> <RANGE> Tests parser on data loaded from S3. Data is selected using a paths file <DATE> and file <RANGE> (i.e 5-7)" |
| echo " " |
| echo "NOTE: All commands except getpaths will run in background and output to a log by default. Add -fg to end of these commands" |
| echo "to run them in the foreground." |
| echo " " |
| exit 1 |
| esac |