/*
Original script:
http://www.adick.at/2008-10-15,recrawl-script-for-nutch/
*/
// basic initializations: load the shared helpers and crawl settings from nutch_base.gv
evaluate(new File("scripts/nutch_base.gv"))
try
{
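// execNutch, echo, rm_rf, mv, touch, lastSegment and tomcat come from nutch_base.gv;
// execNutch is assumed to run the given Nutch command and store its exit code in 'err',
// which is checked after each critical step below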
// start the script
echo "----- Inject (Step 1 of $steps) -----"
execNutch (["inject", "${crawl_dir}/crawldb", "urls"])
echo "----- Generate, Fetch, Parse, Update (Step 2 of $steps) -----"
for (i = 0; i < depth; i++)
{
echo "--- Beginning crawl at depth ${i + 1} of ${depth} ---"
execNutch(["generate", "$crawl_dir/crawldb", "$crawl_dir/segments", "-adddays", "$adddays"])
if (err != 0)
{
echo "runbot: Stopping at depth ${i +1}. No more URLs to fetch."
break
}
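// lastSegment (from nutch_base.gv) is assumed to return the newest, i.e. just generated, segment directory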
segment = lastSegment("$crawl_dir/segments/")
execNutch(["fetch", "$segment", "-threads", "$threads"])
if (err != 0)
{
echo "runbot: fetch $segment at depth ${depth} failed. Deleting it."
rm_rf("$segment")
continue
}
echo "--- Parsing Segment $segment ---"
execNutch (["parse", "$segment"])
execNutch (["updatedb", "$crawl_dir/crawldb", "$segment"])
}
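// stop Tomcat before rebuilding segments and indexes, presumably so the search webapp
// does not hold open the index files that are about to be replaced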
echo "----- Stopping Tomcat (Step 3 of $steps) -----"
tomcat.stop()
echo "----- Merge Segments (Step 4 of $steps) -----"
rm_rf("$crawl_dir/MERGEDsegments")
execNutch (["mergesegs", "$crawl_dir/MERGEDsegments", "-dir", "$crawl_dir/segments/"])
println "err $err"
if (err == 0)
{
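// in safe mode the original segments are kept in FETCHEDsegments instead of being deleted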
if (!safe)
rm_rf("$crawl_dir/segments/*")
else
{
new File("$crawl_dir/FETCHEDsegments").mkdir()
mv("$crawl_dir/segments/*", "$crawl_dir/FETCHEDsegments")
}
mv("$crawl_dir/MERGEDsegments/*", "$crawl_dir/segments")
rm_rf("$crawl_dir/MERGEDsegments")
}
else
{
println "error in merge -> abort"
System.exit(err)
}
echo "----- Invert Links (Step 5 of $steps) -----"
execNutch (["invertlinks", "$crawl_dir/linkdb", "$crawl_dir/segments/*"])
echo "----- Index (Step 6 of $steps) -----"
rm_rf("$crawl_dir/NEWindexes")
rm_rf ("$crawl_dir/OLDindexes")
execNutch (["index", "$crawl_dir/NEWindexes", "$crawl_dir/crawldb", "$crawl_dir/linkdb", "$crawl_dir/segments/*"])
echo "----- Dedup (Step 7 of $steps) -----"
execNutch (["dedup", "$crawl_dir/NEWindexes"])
echo "----- Merge Indexes (Step 8 of $steps) -----"
execNutch (["merge", "$crawl_dir/MERGEDindexes", "$crawl_dir/NEWindexes"])
// in nutch-site, hadoop.tmp.dir points to crawl/tmp
rm_rf ("$crawl_dir/tmp/*")
// replace the live index with the merged one, keeping the previous index in OLDindexes
file = new File("$crawl_dir/OLDindexes")
if (!file.exists()) file.mkdir()
mv("$crawl_dir/index/*", "$crawl_dir/OLDindexes")
mv("$crawl_dir/MERGEDindexes/*", "$crawl_dir/index")
// clean up old indexes directories
if (!safe)
{
rm_rf("$crawl_dir/NEWindexes")
rm_rf("$crawl_dir/OLDindexes")
}
echo "----- Reloading index on the search site (Step 9 of $steps) -----"
if (!safe)
{
touch("${CATALINA_HOME}/webapps/ROOT/WEB-INF/web.xml")
echo "Done!"
}
else
{
echo "runbot: Cannot reload index in safe mode."
echo "runbot: Please reload it manually using the following command:"
echo "runbot: touch ${CATALINA_HOME}/webapps/ROOT/WEB-INF/web.xml"
}
echo "----- Restarting Tomcat (Step 10 of $steps) -----"
tomcat.stop()
tomcat.start()
echo "runbot: FINISHED: Crawl completed!"
}
catch (Throwable ex)
{
ex.printStackTrace()
}