blob: 65d398a9b3457529ccce57e781f56671afbb5bc9 [file] [log] [blame]
#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Abort the whole script as soon as any command exits with a non-zero status.
set -o errexit
# Write the usage summary for this script to stdout, one line per entry.
# Takes no arguments.
print_help() {
  printf '%s\n' \
    "Usage: drat [crawl, index, map, reduce] in order to analyze a repository." \
    " Alternatively, call 'drat go' to run all four automatically." \
    " drat" \
    " go <path to repo> | start OODT and analyze the repository" \
    " crawl <path to repo> | crawl the repository files" \
    " index <path to repo> | index the crawled files" \
    " map | fire off the MapReduce mapper" \
    " reduce | fire off the MapReduce reducer" \
    " help | print this message" \
    " reset | prepare to analyze an entirely new repo" \
    " | CAUTION: will delete previous crawls!"
}
# Tell the user which local URLs show the OODT browser and the Solr catalog.
# Takes no arguments; writes a single line to stdout.
print_ui_info() {
  printf '%s\n' "Navigate to http://localhost:8080/opsui/ to view the OODT browser and http://localhost:8080/solr to view the Solr catalog."
}
# Local service endpoints handed to the tools launched by this script.
# File manager endpoint (used for --filemgrUrl and --fmUrl).
FILEMGR_URL="http://localhost:9000"
# Solr core endpoint (used for --solrUrl).
SOLR_URL="http://localhost:8080/solr/drat"
# Endpoint given to wmgr-client via --url.
CLIENT_URL="http://localhost:9001"
# Crawl the given repository by running crawler_launcher from
# $DRAT_HOME/crawler/bin.
# Globals:   DRAT_HOME (read, must be set), FILEMGR_URL (read)
# Arguments: $1 - file path of the repository to crawl
# Outputs:   crawler_launcher output on stdout/stderr; pushd/popd output is
#            appended to $DRAT_HOME/logs/drat.log
crawl() {
  check_services_running
  check_num_args "crawl" $# 1
  # Fail with a clear message instead of resolving paths against "/".
  local home="${DRAT_HOME:?DRAT_HOME must be set}"
  local log="${home}/logs/drat.log"
  # Every expansion is quoted so that an install directory or repository path
  # containing whitespace or glob characters is passed through as one word.
  pushd "${home}/crawler/bin" >> "${log}" 2>&1
  ./crawler_launcher --operation --metPC --metExtractorConfig \
    "${home}/extractors/code/default.cpr.conf" \
    --metExtractor org.apache.oodt.cas.metadata.extractors.CopyAndRewriteExtractor \
    --filemgrUrl "${FILEMGR_URL}" \
    --clientTransferer org.apache.oodt.cas.filemgr.datatransfer.InPlaceDataTransferFactory \
    --productPath "$1"
  popd >> "${log}" 2>&1
}
# Index the crawled files of the given repository by running the OODT
# SolrIndexer tool from $DRAT_HOME/filemgr/bin.
# Globals:   DRAT_HOME (read, must be set), FILEMGR_URL (read), SOLR_URL (read)
# Arguments: $1 - file path of the repository to index (passed to SolrIndexer
#                 as its final argument)
# Outputs:   indexer output on stdout/stderr; pushd/popd output is appended to
#            $DRAT_HOME/logs/drat.log
index() {
  check_services_running
  check_num_args "index" $# 1
  # Fail with a clear message instead of resolving paths against "/".
  local home="${DRAT_HOME:?DRAT_HOME must be set}"
  local log="${home}/logs/drat.log"
  # Quoted expansions keep paths with whitespace or glob characters intact.
  pushd "${home}/filemgr/bin" >> "${log}" 2>&1
  # The ../lib and ../etc paths are relative to filemgr/bin, hence the pushd.
  java -Djava.ext.dirs=../lib -DSOLR_INDEXER_CONFIG=../etc/indexer.properties \
    org.apache.oodt.cas.filemgr.tools.SolrIndexer --all --fmUrl "${FILEMGR_URL}" \
    --optimize --solrUrl "${SOLR_URL}" "$1"
  popd >> "${log}" 2>&1
}
# Fire off the MapReduce mapper: run wmgr-client from $DRAT_HOME/workflow/bin
# with --dynWorkflow for task urn:drat:MimePartitioner, then print the UI hint.
# Globals:   DRAT_HOME (read, must be set), CLIENT_URL (read)
# Arguments: none
# Outputs:   wmgr-client output and the UI hint on stdout; pushd/popd output
#            is appended to $DRAT_HOME/logs/drat.log
map() {
  check_services_running
  check_num_args "map" $# 0
  # Fail with a clear message instead of resolving paths against "/".
  local home="${DRAT_HOME:?DRAT_HOME must be set}"
  local log="${home}/logs/drat.log"
  # Quoted so an install directory containing whitespace is not word-split.
  pushd "${home}/workflow/bin" >> "${log}" 2>&1
  ./wmgr-client --url "${CLIENT_URL}" --operation --dynWorkflow --taskIds urn:drat:MimePartitioner
  popd >> "${log}" 2>&1
  print_ui_info
}
# Print the RatCodeAudit entries that are still running.
# The "PGE EXEC" instances page of the local ops UI is fetched through the
# tika-app jar (presumably to obtain its text content -- confirm), and only
# lines that mention RatCodeAudit and do not contain FINISHED are kept.
# Globals:   DRAT_HOME (read)
# Arguments: none
# Outputs:   the matching lines on stdout; empty when nothing is running
# Returns:   always 0, so "no match" is not treated as an error under errexit
current_pges() {
  local status="PGE%20EXEC"
  # An array (rather than a string that is later word-split) keeps the jar
  # path intact when DRAT_HOME contains whitespace.
  local -a tika=(java -jar "${DRAT_HOME}/lib/tika-app-1.5.jar")
  # Emit the pipeline output directly: echoing an unquoted $(...) would subject
  # the fetched text to pathname expansion.
  "${tika[@]}" "http://localhost:8080/opsui/instances/${status}/1" \
    | grep -v FINISHED \
    | grep RatCodeAudit \
    || true
}
# Fire off the MapReduce reducer: run wmgr-client from $DRAT_HOME/workflow/bin
# with --dynWorkflow for task urn:drat:RatAggregator, then print the UI hint.
# If current_pges reports mappers that are still running, the user is asked to
# confirm first; any answer not starting with y/Y exits the script with
# status 0.
# Globals:   DRAT_HOME (read, must be set), CLIENT_URL (read)
# Arguments: none
# Outputs:   prompts/messages, wmgr-client output and the UI hint on stdout;
#            pushd/popd output is appended to $DRAT_HOME/logs/drat.log
reduce() {
  check_services_running
  check_num_args "reduce" $# 0
  # Fail with a clear message instead of resolving paths against "/".
  local home="${DRAT_HOME:?DRAT_HOME must be set}"
  local log="${home}/logs/drat.log"
  local yn
  if [[ -n "$(current_pges)" ]]; then
    echo "There are still MapReduce mappers running! It is recommended you wait for them to finish, "
    echo "then try to run '\$DRAT_HOME/bin/drat reduce' again later."
    # -r keeps backslashes in the answer from being interpreted.
    read -r -p "Are you sure you wish to continue? [yN] " yn
    case "${yn}" in
      [Yy]*)
        echo "Continuing..."
        ;;
      *)
        echo "Exiting..."
        exit 0
        ;;
    esac
  fi
  # Quoted so an install directory containing whitespace is not word-split.
  pushd "${home}/workflow/bin" >> "${log}" 2>&1
  ./wmgr-client --url "${CLIENT_URL}" --operation --dynWorkflow --taskIds urn:drat:RatAggregator
  popd >> "${log}" 2>&1
  print_ui_info
}
# Verify that a command received the expected number of arguments.
# Arguments: $1 - name of the option being checked (used in the message)
#            $2 - number of arguments actually received
#            $3 - number of arguments expected
# On a mismatch, prints an error plus the usage text and exits with status 1.
check_num_args() {
  local option=$1
  local actual=$2
  local expected=$3
  # The counts are compared as strings.
  if [[ "${actual}" == "${expected}" ]]; then
    return 0
  fi
  echo "Expected ${expected} args for '${option}', but got ${actual}."
  print_help
  exit 1
}
# List what is using the given TCP port, as reported by lsof.
# Arguments: $1 - the port number
# Outputs:   lsof's listing on stdout; callers in this script only test
#            whether that output is empty
# Returns:   the exit status of lsof
check_port() {
  check_num_args "check port" $# 1
  # -n and -P stop lsof from resolving host names and service names, which only
  # affects how entries are displayed but avoids slow lookups on every call.
  # The selector is quoted so the argument always reaches lsof as one word.
  lsof -nP -i "tcp:$1"
}
# Verify the state of ports 9000, 9001, 9002 and 8080 (the Solr and OODT ports).
# Called with no arguments: every port must be in use, otherwise the user is
# told to start OODT and the script exits with status 1.
# Called with any argument: every port must be free, otherwise the user is
# told to stop OODT and the script exits with status 1.
check_services_running() {
  local port
  local -a ports=(9000 9001 9002 8080)

  if (( $# == 0 )); then
    # Expect everything up: the first idle port aborts.
    for port in "${ports[@]}"; do
      if [[ -z $(check_port "${port}") ]]; then
        echo "Please start OODT by running '\$DRAT_HOME/bin/oodt start'"
        echo "Aborting..."
        exit 1
      fi
    done
    return 0
  fi

  # Expect everything down: the first busy port aborts.
  for port in "${ports[@]}"; do
    if [[ -n $(check_port "${port}") ]]; then
      echo "Please stop OODT by running '\$DRAT_HOME/bin/oodt stop'"
      echo "Aborting..."
      exit 1
    fi
  done
}
# Run the whole pipeline in order: crawl, index, map, wait for the mappers
# reported by current_pges to finish, then reduce.
# Arguments: $1 - the directory (repository) to analyze
# Outputs:   progress messages on stdout, plus whatever the individual steps
#            print
go() {
  check_services_running
  check_num_args "go" $# 1
  # The repository path is the first (and only) argument of this function.
  local repo=$1
  local i
  echo "Crawling ${repo}"
  crawl "${repo}"
  sleep 3
  echo
  echo "Indexing ${repo}"
  index "${repo}"
  sleep 3
  echo
  echo "Firing off the MapReduce mapper"
  map
  echo
  printf '%s' "Waiting for the mapping and partitioning to finish..."
  sleep 3
  # Poll for running mappers; between polls print ten dots, half a second
  # apart, as a progress indicator.
  while [[ -n "$(current_pges)" ]]; do
    for (( i = 0; i < 10; i++ )); do
      printf "."
      sleep .5
    done
  done
  echo
  reduce
}
# Reset drat to prepare for analyzing an entirely new repo.
# Requires the services to be stopped (check_services_running is called with
# an argument), asks for confirmation, and on "y" removes the workflow data,
# the filemgr catalog, the Solr data, and the contents of data/archive and
# data/jobs under $DRAT_HOME.
# Globals:   DRAT_HOME (read, must be set and non-empty)
# Arguments: none
# Outputs:   the prompt and each rm command being run, on stdout
# Exits:     0 when the user answers n/N, 1 on any other non-yes answer
reset() {
  check_services_running "running"
  check_num_args "reset" $# 0
  # Refuse to continue without DRAT_HOME: with an empty value the deletions
  # below would resolve to absolute paths such as /data/archive/*.
  local home="${DRAT_HOME:?DRAT_HOME must be set}"
  local yn
  echo "This will remove any previous or current crawls."
  # -r keeps backslashes in the answer from being interpreted.
  read -r -p "Do you wish to continue? [yN] " yn
  case "${yn}" in
    [Yy]*)
      # Paths are quoted so whitespace in DRAT_HOME cannot redirect rm to a
      # different directory; "--" ends option parsing. The trailing * stays
      # outside the quotes so it still expands to the directory contents.
      echo "rm -rf ${home}/data/workflow"
      rm -rf -- "${home}/data/workflow"
      echo "rm -rf ${home}/filemgr/catalog"
      rm -rf -- "${home}/filemgr/catalog"
      echo "rm -rf ${home}/solr/drat/data"
      rm -rf -- "${home}/solr/drat/data"
      echo "rm -rf ${home}/data/archive/*"
      rm -rf -- "${home}/data/archive/"*
      echo "rm -rf ${home}/data/jobs/*"
      rm -rf -- "${home}/data/jobs/"*
      echo "Please restart OODT with '\$DRAT_HOME/bin/oodt start' if you wish to run another crawl."
      ;;
    [Nn]*)
      echo "Reset cancelled. Exiting..."
      exit 0
      ;;
    *)
      echo "Aborting..."
      exit 1
      ;;
  esac
}
# Dispatch on the first command-line argument.
# For the commands that take a repository path, ${2:+"$2"} expands to the
# second argument as a single quoted word when it is set and non-empty, and to
# nothing otherwise. A path containing whitespace therefore arrives intact,
# while a missing path still reaches check_num_args as zero arguments.
case "$1" in
  crawl)
    crawl ${2:+"$2"}
    ;;
  index)
    index ${2:+"$2"}
    ;;
  map)
    map
    ;;
  reduce)
    reduce
    ;;
  go)
    go ${2:+"$2"}
    ;;
  reset)
    reset
    ;;
  help)
    print_help
    ;;
  *)
    echo "Unrecognized option: '$1'"
    print_help
    exit 1
    ;;
esac