Merge pull request #521 from sebastian-nagel/NUTCH-2002-checkers-robotstxt

NUTCH-2002 parse and index checkers to check robots.txt
diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java
index 5dcd2ea..04c2ae8 100644
--- a/src/java/org/apache/nutch/crawl/Generator.java
+++ b/src/java/org/apache/nutch/crawl/Generator.java
@@ -841,6 +841,14 @@
           String.format(Locale.ROOT, "%6d", counter.getValue()),
           counter.getName());
     }
+    if (!getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
+      /*
+       * generated items are not marked in CrawlDb, and CrawlDb will not
+       * be accessed anymore: we can already release the lock
+       */
+      LockUtil.removeLockFile(getConf(), lock);
+      lock = null;
+    }
 
     // read the subdirectories generated in the temp
     // output and turn them into segments
@@ -858,15 +866,13 @@
       }
     } catch (Exception e) {
       LOG.warn("Generator: exception while partitioning segments, exiting ...");
-      LockUtil.removeLockFile(getConf(), lock);
-      fs.delete(tempDir, true);
+      NutchJob.cleanupAfterFailure(tempDir, lock, fs);
       return null;
     }
 
     if (generatedSegments.size() == 0) {
       LOG.warn("Generator: 0 records selected for fetching, exiting ...");
-      LockUtil.removeLockFile(getConf(), lock);
-      fs.delete(tempDir, true);
+      NutchJob.cleanupAfterFailure(tempDir, lock, fs);
       return null;
     }
 
@@ -913,7 +919,9 @@
       fs.delete(tempDir2, true);
     }
 
-    LockUtil.removeLockFile(getConf(), lock);
+    if (lock != null) {
+      LockUtil.removeLockFile(getConf(), lock);
+    }
     fs.delete(tempDir, true);
 
     long end = System.currentTimeMillis();
diff --git a/src/java/org/apache/nutch/parse/HTMLMetaTags.java b/src/java/org/apache/nutch/parse/HTMLMetaTags.java
index 7c301e1..23a9339 100644
--- a/src/java/org/apache/nutch/parse/HTMLMetaTags.java
+++ b/src/java/org/apache/nutch/parse/HTMLMetaTags.java
@@ -68,6 +68,13 @@
   }
 
   /**
+   * Assigns <code>false</code> to <code>noFollow</code>.
+   */
+  public void setFollow() {
+    this.noFollow = false;
+  }
+
+  /**
    * Sets <code>noIndex</code> to <code>true</code>.
    */
   public void setNoIndex() {
@@ -75,6 +82,13 @@
   }
 
   /**
+   * Assigns <code>false</code> to <code>noIndex</code>.
+   */
+  public void setIndex() {
+    this.noIndex = false;
+  }
+
+  /**
    * Sets <code>noCache</code> to <code>true</code>.
    */
   public void setNoCache() {
@@ -82,6 +96,13 @@
   }
 
   /**
+   * Assigns <code>false</code> to <code>noCache</code>.
+   */
+  public void setCache() {
+    this.noCache = false;
+  }
+
+  /**
    * Sets <code>refresh</code> to the supplied value.
    */
   public void setRefresh(boolean refresh) {
diff --git a/src/java/org/apache/nutch/tools/FreeGenerator.java b/src/java/org/apache/nutch/tools/FreeGenerator.java
index 8c537d9..b51be74 100644
--- a/src/java/org/apache/nutch/tools/FreeGenerator.java
+++ b/src/java/org/apache/nutch/tools/FreeGenerator.java
@@ -146,27 +146,33 @@
   @Override
   public int run(String[] args) throws Exception {
     if (args.length < 2) {
-      System.err
-          .println("Usage: FreeGenerator <inputDir> <segmentsDir> [-filter] [-normalize]");
+      System.err.println(
+          "Usage: FreeGenerator <inputDir> <segmentsDir> [-filter] [-normalize] [-numFetchers <n>]");
       System.err
           .println("\tinputDir\tinput directory containing one or more input files.");
       System.err
-          .println("\t\tEach text file contains a list of URLs, one URL per line");
+          .println("\t        \tEach text file contains a list of URLs, one URL per line");
       System.err
           .println("\tsegmentsDir\toutput directory, where new segment will be created");
-      System.err.println("\t-filter\trun current URLFilters on input URLs");
+      System.err.println("\t-filter   \trun current URLFilters on input URLs");
       System.err
           .println("\t-normalize\trun current URLNormalizers on input URLs");
+      System.err.println(
+          "\t-numFetchers <n>\tnumber of generated fetch lists, determines number of fetcher tasks");
       return -1;
     }
     boolean filter = false;
     boolean normalize = false;
+    int numFetchers = -1;
     if (args.length > 2) {
       for (int i = 2; i < args.length; i++) {
         if (args[i].equals("-filter")) {
           filter = true;
         } else if (args[i].equals("-normalize")) {
           normalize = true;
+        } else if ("-numFetchers".equals(args[i]) && i + 1 < args.length) {
+          numFetchers = Integer.parseInt(args[i + 1]); // guarded: a value follows the flag
+          i++; // skip the value just consumed; a bare trailing flag is rejected by the else branch
         } else {
           LOG.error("Unknown argument: " + args[i] + ", exiting ...");
           return -1;
@@ -191,7 +197,17 @@
     job.setPartitionerClass(URLPartitioner.class);
     job.setReducerClass(FG.FGReducer.class);
     String segName = Generator.generateSegmentName();
-    job.setNumReduceTasks(Integer.parseInt(conf.get("mapreduce.job.maps")));
+    if (numFetchers == -1) {
+      /* for politeness create exactly one partition per fetch task */
+      numFetchers = Integer.parseInt(conf.get("mapreduce.job.maps"));
+    }
+    if ("local".equals(conf.get("mapreduce.framework.name"))
+        && numFetchers != 1) {
+      // override
+      LOG.info(
+          "FreeGenerator: running in local mode, generating exactly one partition.");
+      numFetchers = 1;
+    }
     job.setOutputFormatClass(SequenceFileOutputFormat.class);
     job.setOutputKeyClass(Text.class);
     job.setOutputValueClass(CrawlDatum.class);
diff --git a/src/java/org/apache/nutch/util/NutchJob.java b/src/java/org/apache/nutch/util/NutchJob.java
index 991e506..13257d2 100644
--- a/src/java/org/apache/nutch/util/NutchJob.java
+++ b/src/java/org/apache/nutch/util/NutchJob.java
@@ -41,7 +41,15 @@
     return Job.getInstance(conf);
   } 
 
-  /*
+  /**
+   * Clean up after a job failure when there is no lock file: delegates with a {@code null} lock.
+   */
+  public static void cleanupAfterFailure(Path tempDir, FileSystem fs)
+      throws IOException {
+    cleanupAfterFailure(tempDir, null, fs);
+  }
+
+  /**
    * Clean up the file system in case of a job failure.
    */
   public static void cleanupAfterFailure(Path tempDir, Path lock, FileSystem fs)
@@ -50,7 +58,9 @@
       if (fs.exists(tempDir)) {
         fs.delete(tempDir, true);
       }
-      LockUtil.removeLockFile(fs, lock);
+      if (lock != null) {
+        LockUtil.removeLockFile(fs, lock);
+      }
     } catch (IOException e) {
       LOG.error("NutchJob cleanup failed: {}", e.getMessage());
       throw e;