#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Stop on first error.
set -e
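# Optional guard: every command below relies on DRAT_HOME pointing at the DRAT
# install root, so fail fast with a hint if it is unset.
if [ -z "$DRAT_HOME" ]; then
    echo "DRAT_HOME is not set. Please export DRAT_HOME before running drat." >&2
    exit 1
fi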
# Print out usage information for this script.
function print_help {
    echo "Usage: drat [crawl, index, map, reduce], run in that order, to analyze a repository."
    echo "       Alternatively, call 'drat go' to run all four steps automatically."
    echo "  drat"
    echo "    go <path to repo>    | start OODT and analyze the repository"
    echo "    crawl <path to repo> | crawl the repository files"
    echo "    index <path to repo> | index the crawled files"
    echo "    map                  | fire off the MapReduce mapper"
    echo "    reduce               | fire off the MapReduce reducer"
    echo "    help                 | print this message"
    echo "    reset                | prepare to analyze an entirely new repo"
    echo "                         | CAUTION: will delete previous crawls!"
}
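# Endpoints for the locally running OODT services used below: the File Manager,
# the DRAT Solr core, and the Workflow Manager.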
FILEMGR_URL=http://localhost:9000
SOLR_URL=http://localhost:8080/solr/drat
CLIENT_URL=http://localhost:9001
# Crawl the given repository. Expects one argument -- the file path of the repo to be crawled.
function crawl {
    $DRAT_HOME/crawler/bin/crawler_launcher --operation --metPC --metExtractorConfig \
        $DRAT_HOME/extractors/code/default.cpr.conf --metExtractor org.apache.oodt.cas.metadata.extractors.CopyAndRewriteExtractor \
        --filemgrUrl $FILEMGR_URL --clientTransferer org.apache.oodt.cas.filemgr.datatransfer.InPlaceDataTransferFactory --productPath "$1"
}
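# Roughly, the --metPC operation above runs a metadata-extracting product crawler:
# each file under the given path is ingested into the File Manager at $FILEMGR_URL,
# with metadata produced by the CopyAndRewriteExtractor and in-place data transfer
# (files are catalogued where they sit rather than copied into an archive).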
# Index the crawled files of the given repo. Expects one argument -- the file path of the repo to be indexed.
function index {
    java -Djava.ext.dirs=$DRAT_HOME/filemgr/lib -DSOLR_INDEXER_CONFIG=$DRAT_HOME/filemgr/etc/indexer.properties \
        org.apache.oodt.cas.filemgr.tools.SolrIndexer --all --fmUrl $FILEMGR_URL --optimize --solrUrl $SOLR_URL "$1"
}
# Fire off the MapReduce mapper. Expects no arguments.
function map {
    $DRAT_HOME/workflow/bin/wmgr-client --url $CLIENT_URL --operation --dynWorkflow --taskIds urn:drat:MimePartitioner
}
# Fire off the MapReduce reducer. Expects no arguments.
function reduce {
    $DRAT_HOME/workflow/bin/wmgr-client --url $CLIENT_URL --operation --dynWorkflow --taskIds urn:drat:RatAggregator
}
# Ensure the number of arguments matches the expected number. Expects three arguments:
# the option name, the actual number of arguments, and the expected number of arguments.
function check_num_args {
    if [[ "$2" != "$3" ]]; then
        echo "Expected $(($3 - 1)) arg(s) for $1, but got $(($2 - 1))." # Use $(( )) for arithmetic evaluation.
        print_help
        exit 1
    fi
}
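# Example: 'drat crawl /path/to/repo' reaches the crawl case below with $# == 2,
# so 'check_num_args crawl 2 2' passes; a bare 'drat crawl' prints
# "Expected 1 arg(s) for crawl, but got 0." and exits.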
# Start parsing the arguments.
case $1 in
    crawl)
        check_num_args $1 $# 2
        crawl "$2"
        ;;
    index)
        check_num_args $1 $# 2
        index "$2"
        ;;
    map)
        check_num_args $1 $# 1
        map
        ;;
    reduce)
        check_num_args $1 $# 1
        reduce
        ;;
    go)
        # Sleep briefly between steps to give each command time to finish up;
        # otherwise Solr occasionally runs into issues.
        check_num_args $1 $# 2
        echo "Crawling $2"
        crawl "$2"
        sleep 1
        echo
        echo "Indexing $2"
        index "$2"
        sleep 1
        echo
        echo "Firing off the MapReduce mapper"
        map
        echo
        echo "Firing off the MapReduce reducer"
        reduce
        echo "Navigate to http://localhost:8080/opsui/ to view the OODT browser and http://localhost:8080/solr to view the Solr catalog."
        ;;
    reset)
        check_num_args $1 $# 1
        echo "Please stop OODT by running 'oodt stop' before running reset."
        echo "This will remove any previous or current crawls."
        read -p "Do you wish to continue? [yN] " yn
        case $yn in
            [Yy]*)
                echo
                echo "rm -rf $DRAT_HOME/data/workflow"
                rm -rf "$DRAT_HOME/data/workflow"
                echo "rm -rf $DRAT_HOME/filemgr/catalog"
                rm -rf "$DRAT_HOME/filemgr/catalog"
                echo "rm -rf $DRAT_HOME/solr/drat/data"
                rm -rf "$DRAT_HOME/solr/drat/data"
                echo "rm -rf $DRAT_HOME/data/archive/*"
                rm -rf "$DRAT_HOME"/data/archive/*
                echo "Please restart OODT with 'oodt start' if you wish to run another crawl."
                ;;
            [Nn]*)
                echo "Reset cancelled. Exiting..."
                exit 0
                ;;
            *)
                echo "Aborting..."
                exit 1
                ;;
        esac
        ;;
    help)
        print_help
        ;;
    *)
        echo "Unrecognized option: '$1'"
        print_help
        exit 1
        ;;
esac