| #!/bin/bash |
| |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
# Abort the whole script as soon as any command exits with a non-zero status.
set -o errexit
| |
# Print the usage summary for this script to stdout.
# Arguments: none
# Outputs:   one line per sub-command, preceded by a short header
function print_help {
  local -a usage_lines=(
    "Usage: drat [crawl, index, map, reduce] in order to analyze a repository."
    " Alternatively, call 'drat go' to run all four automatically."
    " drat"
    " go <path to repo> | start OODT and analyze the repository"
    " crawl <path to repo> | crawl the repository files"
    " index <path to repo> | index the crawled files"
    " map | fire off the MapReduce mapper"
    " reduce | fire off the MapReduce reducer"
    " help | print this message"
    " reset | prepare to analyze an entirely new repo"
    " | CAUTION: will delete previous crawls!"
  )
  # A single printf emits every entry followed by a newline.
  printf '%s\n' "${usage_lines[@]}"
}
| |
# Tell the user where the OODT browser and the Solr catalog can be viewed.
# Arguments: none
# Outputs:   a single line on stdout
function print_ui_info {
  local opsui_url="http://localhost:8080/opsui/"
  local solr_url="http://localhost:8080/solr"
  printf 'Navigate to %s to view the OODT browser and %s to view the Solr catalog.\n' \
    "$opsui_url" "$solr_url"
}
| |
# Service endpoints used by the sub-commands below.
# File manager endpoint handed to the crawler (--filemgrUrl) and the indexer (--fmUrl).
FILEMGR_URL="http://localhost:9000"
# Solr core handed to the indexer (--solrUrl).
SOLR_URL="http://localhost:8080/solr/drat"
# Endpoint handed to wmgr-client (--url) by the map and reduce steps.
CLIENT_URL="http://localhost:9001"
| |
# Crawl the given repository by running crawler_launcher against it.
# Globals:   DRAT_HOME (read), FILEMGR_URL (read)
# Arguments: $1 - file path of the repository to crawl
# Outputs:   pushd/popd output is appended to $DRAT_HOME/logs/drat.log;
#            the crawler's own output is left on stdout/stderr
function crawl {
  check_services_running
  check_num_args "crawl" $# 1
  local log_file="$DRAT_HOME/logs/drat.log"
  # crawler_launcher is invoked by relative path, so run it from its own directory.
  pushd "$DRAT_HOME/crawler/bin" >> "$log_file" 2>&1
  # Every expansion is quoted so paths containing spaces or glob characters
  # reach the crawler as a single argument.
  ./crawler_launcher --operation --metPC --metExtractorConfig \
    "$DRAT_HOME/extractors/code/default.cpr.conf" \
    --metExtractor org.apache.oodt.cas.metadata.extractors.CopyAndRewriteExtractor \
    --filemgrUrl "$FILEMGR_URL" \
    --clientTransferer org.apache.oodt.cas.filemgr.datatransfer.InPlaceDataTransferFactory \
    --productPath "$1"
  popd >> "$log_file" 2>&1
}
| |
# Index the crawled files of the given repository by running the OODT SolrIndexer.
# Globals:   DRAT_HOME (read), FILEMGR_URL (read), SOLR_URL (read)
# Arguments: $1 - file path of the repository to index; it is handed to
#                 SolrIndexer as its trailing argument
# Outputs:   pushd/popd output is appended to $DRAT_HOME/logs/drat.log;
#            the indexer's own output is left on stdout/stderr
function index {
  check_services_running
  check_num_args "index" $# 1
  local log_file="$DRAT_HOME/logs/drat.log"
  # The java options below use paths relative to filemgr/bin (../lib, ../etc).
  pushd "$DRAT_HOME/filemgr/bin" >> "$log_file" 2>&1
  # Expansions are quoted so values with spaces or glob characters stay intact.
  java -Djava.ext.dirs=../lib -DSOLR_INDEXER_CONFIG=../etc/indexer.properties \
    org.apache.oodt.cas.filemgr.tools.SolrIndexer --all --fmUrl "$FILEMGR_URL" \
    --optimize --solrUrl "$SOLR_URL" "$1"
  popd >> "$log_file" 2>&1
}
| |
# Fire off the MapReduce mapper by submitting the urn:drat:MimePartitioner task
# through wmgr-client.
# Globals:   DRAT_HOME (read), CLIENT_URL (read)
# Arguments: none
# Outputs:   pushd/popd output is appended to $DRAT_HOME/logs/drat.log;
#            wmgr-client output and the UI hint go to stdout
function map {
  check_services_running
  check_num_args "map" $# 0
  local log_file="$DRAT_HOME/logs/drat.log"
  # wmgr-client is invoked by relative path, so run it from its own directory.
  pushd "$DRAT_HOME/workflow/bin" >> "$log_file" 2>&1
  ./wmgr-client --url "$CLIENT_URL" --operation --dynWorkflow \
    --taskIds urn:drat:MimePartitioner
  popd >> "$log_file" 2>&1
  print_ui_info
}
| |
# Print the RatCodeAudit entries that are still in progress.
# Runs the Tika app jar against the OPSUI "PGE EXEC" instances URL and keeps
# the output lines that mention RatCodeAudit and do not contain FINISHED.
# Globals:   DRAT_HOME (read)
# Arguments: none
# Outputs:   matching lines on stdout, one per line; a single empty line when
#            nothing matches, so "$(current_pges)" is empty in that case
# Returns:   0, whether or not anything matched
function current_pges {
  local status="PGE%20EXEC"
  # Keep the command in an array so a DRAT_HOME containing spaces stays one word.
  local -a tika=(java -jar "$DRAT_HOME/lib/tika-app-1.5.jar")
  local running
  # grep exits non-zero when nothing matches; that is the normal "no tasks
  # running" answer and must not trip 'set -e'.
  running=$("${tika[@]}" "http://localhost:8080/opsui/instances/${status}/1" \
    | grep -v FINISHED \
    | grep RatCodeAudit) || true
  printf '%s\n' "$running"
}
| |
# Fire off the MapReduce reducer by submitting the urn:drat:RatAggregator task
# through wmgr-client. If current_pges still reports running tasks, the user
# is asked to confirm first; anything other than an answer starting with
# y/Y (including end-of-input) exits the script with status 0.
# Globals:   DRAT_HOME (read), CLIENT_URL (read)
# Arguments: none
# Outputs:   pushd/popd output is appended to $DRAT_HOME/logs/drat.log;
#            prompts, wmgr-client output and the UI hint go to stdout
function reduce {
  check_services_running
  check_num_args "reduce" $# 0
  local yn=""
  if [[ -n $(current_pges) ]]; then
    echo "There are still MapReduce mappers running! It is recommended you wait for them to finish, "
    echo "then try to run '\$DRAT_HOME/bin/drat reduce' again later."
    # read fails on end-of-input; fall back to the default answer (no) instead
    # of letting 'set -e' kill the script without a message.
    read -r -p "Are you sure you wish to continue? [yN] " yn || yn=""
    case "$yn" in
      [Yy]*)
        echo "Continuing..."
        ;;
      *)
        echo "Exiting..."
        exit 0
        ;;
    esac
  fi
  local log_file="$DRAT_HOME/logs/drat.log"
  # wmgr-client is invoked by relative path, so run it from its own directory.
  pushd "$DRAT_HOME/workflow/bin" >> "$log_file" 2>&1
  ./wmgr-client --url "$CLIENT_URL" --operation --dynWorkflow \
    --taskIds urn:drat:RatAggregator
  popd >> "$log_file" 2>&1
  print_ui_info
}
| |
# Verify that a sub-command received the expected number of arguments.
# Arguments: $1 - option name, used only in the error message
#            $2 - number of arguments actually received
#            $3 - number of arguments expected
# Outputs:   on mismatch, an error line followed by the usage text on stdout
# Returns:   0 when the counts match; otherwise exits the script with status 1
function check_num_args {
  local option_name=$1
  local actual=$2
  local expected=$3
  # Nothing to report when the counts agree (compared as strings).
  if [[ "$actual" == "$expected" ]]; then
    return 0
  fi
  echo "Expected ${expected} args for '${option_name}', but got ${actual}."
  print_help
  exit 1
}
| |
# List what lsof reports for the given TCP port.
# Arguments: $1 - the port number
# Outputs:   lsof's listing on stdout; callers in this script treat empty
#            output as "nothing is using the port"
# Returns:   lsof's exit status
function check_port {
  check_num_args "check port" $# 1
  # Quoted so the selector always reaches lsof as a single argument.
  lsof -i "tcp:$1"
}
| |
# Check the Solr and OODT ports (9000, 9001, 9002 and 8080) via check_port.
# Called with no arguments, every port must be busy; if any is free the user
# is told to start OODT. Called with any argument, every port must be free;
# if any is busy the user is told to stop OODT.
# Outputs:   on failure, two lines on stdout
# Returns:   0 when the check passes; otherwise exits the script with status 1
function check_services_running {
  local port
  if (( $# == 0 )); then
    # Expect the services to be up: stop at the first port with nothing on it.
    for port in 9000 9001 9002 8080; do
      if [[ -z $(check_port "$port") ]]; then
        echo "Please start OODT by running '\$DRAT_HOME/bin/oodt start'"
        echo "Aborting..."
        exit 1
      fi
    done
    return 0
  fi
  # Expect the services to be down: stop at the first port still in use.
  for port in 9000 9001 9002 8080; do
    if [[ -n $(check_port "$port") ]]; then
      echo "Please stop OODT by running '\$DRAT_HOME/bin/oodt stop'"
      echo "Aborting..."
      exit 1
    fi
  done
}
| |
# Run crawl, index, map and reduce in sequence for one repository.
# OODT must already be running: check_services_running aborts otherwise.
# Arguments: $1 - the directory to analyze
# Outputs:   progress messages on stdout, plus whatever the individual steps print
function go {
  check_services_running
  check_num_args "go" $# 1
  local repo_path=$1
  local i
  # Report the path that is actually being processed ($1; there is no $2 here).
  echo "Crawling $repo_path"
  crawl "$repo_path"
  sleep 3
  echo
  echo "Indexing $repo_path"
  index "$repo_path"
  sleep 3
  echo
  echo "Firing off the MapReduce mapper"
  map
  echo
  printf "Waiting for the mapping and partitioning to finish..."
  sleep 3
  # Poll until current_pges reports nothing; between polls print ten dots,
  # half a second apart.
  while [[ -n $(current_pges) ]]; do
    for (( i = 0; i < 10; i++ )); do
      printf "."
      sleep .5
    done
  done
  echo
  reduce
}
| |
# Reset drat to prepare for analyzing an entirely new repo by deleting the
# workflow data, the file manager catalog, the Solr data and the contents of
# the archive directory under DRAT_HOME.
# check_services_running is called with an argument, which makes it verify
# that the service ports are NOT in use, so reset only proceeds while OODT is
# stopped.
# Globals:   DRAT_HOME (read; must be set and non-empty)
# Arguments: none
# Outputs:   the prompt and each rm command being run, on stdout
# Returns:   0 after a successful reset; exits 0 when the answer starts with
#            n/N, and exits 1 on any other answer (including end-of-input)
function reset {
  check_services_running "running"
  check_num_args "reset" $# 0
  # Refuse to continue without DRAT_HOME: the rm commands below would
  # otherwise resolve to /data/workflow, /filemgr/catalog and so on.
  : "${DRAT_HOME:?DRAT_HOME must be set before running reset}"
  local yn=""
  echo "This will remove any previous or current crawls."
  # read fails on end-of-input; treat that as an empty answer so the user
  # still gets the "Aborting..." message.
  read -r -p "Do you wish to continue? [yN] " yn || yn=""
  case "$yn" in
    [Yy]*)
      echo "rm -rf $DRAT_HOME/data/workflow"
      rm -rf -- "$DRAT_HOME/data/workflow"
      echo "rm -rf $DRAT_HOME/filemgr/catalog"
      rm -rf -- "$DRAT_HOME/filemgr/catalog"
      echo "rm -rf $DRAT_HOME/solr/drat/data"
      rm -rf -- "$DRAT_HOME/solr/drat/data"
      echo "rm -rf $DRAT_HOME/data/archive/*"
      # Only the glob is left unquoted: the archive directory itself is kept
      # and its (non-hidden) entries are removed.
      rm -rf -- "$DRAT_HOME"/data/archive/*
      echo "Please restart OODT with '\$DRAT_HOME/bin/oodt start' if you wish to run another crawl."
      ;;
    [Nn]*)
      echo "Reset cancelled. Exiting..."
      exit 0
      ;;
    *)
      echo "Aborting..."
      exit 1
      ;;
  esac
}
| |
# Dispatch on the sub-command given as the first script argument.
# For crawl, index and go the second argument is forwarded as one quoted word
# when it is set and non-empty, and omitted entirely otherwise, so a missing
# path is still reported by the callee's argument-count check while paths
# containing spaces or glob characters are passed through intact.
case "$1" in
  crawl)
    crawl ${2:+"$2"}
    ;;
  index)
    index ${2:+"$2"}
    ;;
  map)
    map
    ;;
  reduce)
    reduce
    ;;
  go)
    go ${2:+"$2"}
    ;;
  reset)
    reset
    ;;
  help)
    print_help
    ;;
  *)
    echo "Unrecognized option: '$1'"
    print_help
    exit 1
    ;;
esac