Merge pull request #486 from sebastian-nagel/NUTCH-2184-indexer-no-crawldb
NUTCH-2184 Enable IndexingJob to function with no crawldb
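Not part of the patch itself: a minimal sketch of how the new -nocrawldb mode can be driven from Java once this change is applied. The wrapper class name, segment path and chosen options below are illustrative, not taken from the repository; the rough command-line equivalent would be something like "bin/nutch index -nocrawldb <segment> -deleteGone".

import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.indexer.IndexingJob;
import org.apache.nutch.util.NutchConfiguration;

public class IndexWithoutCrawlDb {            // hypothetical wrapper, for illustration only
  public static void main(String[] args) throws Exception {
    // -nocrawldb replaces the formerly mandatory CrawlDb path;
    // at least one indexable segment is still required.
    String[] indexerArgs = {
        "-nocrawldb",
        "crawl/segments/20240101000000",      // made-up segment path
        "-deleteGone"                         // optional: send deletion requests for 404s/redirects
    };
    int res = ToolRunner.run(NutchConfiguration.create(), new IndexingJob(), indexerArgs);
    System.exit(res);
  }
}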
diff --git a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
index fedfeb7..cfb6dea 100644
--- a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
+++ b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
@@ -50,6 +50,24 @@
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
+/**
+ * <p>
+ * This class is typically invoked from within
+ * {@link org.apache.nutch.indexer.IndexingJob} and handles all MapReduce
+ * functionality required when undertaking indexing.
+ * </p>
+ * <p>
+ * The actual indexing is performed by one or more indexing plugins which
+ * extend {@link org.apache.nutch.indexer.IndexWriter}.
+ * </p>
+ * <p>
+ * See
+ * {@link org.apache.nutch.indexer.IndexerMapReduce#initMRJob(Path, Path, Collection, Job, boolean)}
+ * for details on the specific data structures and parameters required for
+ * indexing.
+ * </p>
+ *
+ */
public class IndexerMapReduce extends Configured {
private static final Logger LOG = LoggerFactory
@@ -100,7 +118,7 @@
.normalize(url, URLNormalizers.SCOPE_INDEXER);
normalized = normalized.trim();
} catch (Exception e) {
- LOG.warn("Skipping " + url + ":" + e);
+ LOG.warn("Skipping {}: {}", url, e);
normalized = null;
}
}
@@ -418,12 +436,26 @@
public static void initMRJob(Path crawlDb, Path linkDb,
Collection<Path> segments, Job job, boolean addBinaryContent) throws IOException{
- LOG.info("IndexerMapReduce: crawldb: {}", crawlDb);
-
- if (linkDb != null)
- LOG.info("IndexerMapReduce: linkdb: {}", linkDb);
-
Configuration conf = job.getConfiguration();
+
+ if (crawlDb != null) {
+ LOG.info("IndexerMapReduce: crawldb: {}", crawlDb);
+ Path currentCrawlDb = new Path(crawlDb, CrawlDb.CURRENT_NAME);
+ try {
+ if (currentCrawlDb.getFileSystem(conf).exists(currentCrawlDb)) {
+ FileInputFormat.addInputPath(job, currentCrawlDb);
+ } else {
+ LOG.warn(
+ "Ignoring crawlDb for indexing, no crawlDb found in path: {}",
+ crawlDb);
+ }
+ } catch (IOException e) {
+ LOG.warn("Failed to use crawlDb ({}) for indexing", crawlDb, e);
+ }
+ } else {
+ LOG.info("IndexerMapReduce: no crawldb provided for indexing");
+ }
+
for (final Path segment : segments) {
LOG.info("IndexerMapReduces: adding segment: {}", segment);
FileInputFormat.addInputPath(job, new Path(segment,
@@ -438,9 +470,8 @@
}
}
- FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
-
if (linkDb != null) {
+ LOG.info("IndexerMapReduce: linkdb: {}", linkDb);
Path currentLinkDb = new Path(linkDb, LinkDb.CURRENT_NAME);
try {
if (currentLinkDb.getFileSystem(conf).exists(currentLinkDb)) {
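For illustration only (not something this patch adds): with the changes above, IndexerMapReduce.initMRJob accepts a null crawlDb, so an indexing job can be wired up from segments alone. The wrapper class and segment path below are made up.

import java.util.Collection;
import java.util.Collections;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.nutch.indexer.IndexerMapReduce;
import org.apache.nutch.util.NutchConfiguration;

public class InitMRJobWithoutCrawlDb {        // hypothetical helper, for illustration only
  public static void main(String[] args) throws Exception {
    Configuration conf = NutchConfiguration.create();
    Job job = Job.getInstance(conf, "index-segments-only");
    Collection<Path> segments =
        Collections.singletonList(new Path("crawl/segments/20240101000000")); // made-up path

    // crawlDb == null, linkDb == null: only the segment data directories are added as
    // MapReduce input paths; initMRJob now logs "no crawldb provided for indexing"
    // instead of failing on the missing CrawlDb.
    IndexerMapReduce.initMRJob(null, null, segments, job, false);
  }
}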
diff --git a/src/java/org/apache/nutch/indexer/IndexingJob.java b/src/java/org/apache/nutch/indexer/IndexingJob.java
index 30c5504..e476adc 100644
--- a/src/java/org/apache/nutch/indexer/IndexingJob.java
+++ b/src/java/org/apache/nutch/indexer/IndexingJob.java
@@ -167,15 +167,59 @@
}
}
+ private static void usage() {
+ System.err.println(
+ "Usage: Indexer (<crawldb> | -nocrawldb) (<segment> ... | -dir <segments>) [general options]");
+ System.err.println("");
+ System.err.println("Index given segments using configured indexer plugins");
+ System.err.println("");
+ System.err.println(
+ "The CrawlDb is optional but it is required to send deletion requests for duplicates");
+ System.err.println(
+ "and to read the proper document score/boost/weight passed to the indexers.");
+ System.err.println("");
+ System.err.println("Required arguments:");
+ System.err.println("");
+ System.err.println("\t<crawldb>\tpath to CrawlDb, or");
+ System.err.println(
+ "\t-nocrawldb\tflag to indicate that no CrawlDb shall be used");
+ System.err.println("");
+ System.err.println("\t<segment> ...\tpath(s) to segment, or");
+ System.err.println("\t-dir <segments>\tpath to segments/ directory,");
+ System.err.println(
+ "\t \t(all subdirectories are read as segments)");
+ System.err.println("");
+ System.err.println("General options:");
+ System.err.println("\t");
+ System.err.println(
+ "\t-linkdb <linkdb>\tuse LinkDb to index anchor texts of incoming links");
+ System.err.println(
+ "\t-params k1=v1&k2=v2...\tparameters passed to indexer plugins");
+ System.err.println(
+ "\t \t(via property indexer.additional.params)");
+ System.err.println("");
+ System.err.println(
+ "\t-noCommit\tdo not call the commit method of indexer plugins");
+ System.err.println(
+ "\t-deleteGone\tsend deletion requests for 404s, redirects, duplicates");
+ System.err
+ .println("\t-filter \tskip documents with URL rejected by configured URL filters");
+ System.err.println("\t-normalize\tnormalize URLs before indexing");
+ System.err.println(
+ "\t-addBinaryContent\tindex raw/binary content in field `binaryContent`");
+ System.err.println("\t-base64 \tuse Base64 encoding for binary content");
+ System.err.println("");
+ }
+
public int run(String[] args) throws Exception {
- if (args.length < 2) {
- System.err
- //.println("Usage: Indexer <crawldb> [-linkdb <linkdb>] [-params k1=v1&k2=v2...] (<segment> ... | -dir <segments>) [-noCommit] [-deleteGone] [-filter] [-normalize]");
- .println("Usage: Indexer <crawldb> [-linkdb <linkdb>] [-params k1=v1&k2=v2...] (<segment> ... | -dir <segments>) [-noCommit] [-deleteGone] [-filter] [-normalize] [-addBinaryContent] [-base64]");
+ if (args.length == 0) {
+ usage();
return -1;
}
- final Path crawlDb = new Path(args[0]);
+ Path crawlDb = null;
+ boolean noCrawlDb = false;
+
Path linkDb = null;
final List<Path> segments = new ArrayList<>();
@@ -188,10 +232,12 @@
boolean addBinaryContent = false;
boolean base64 = false;
- for (int i = 1; i < args.length; i++) {
+ for (int i = 0; i < args.length; i++) {
FileSystem fs = null;
Path dir = null;
- if (args[i].equals("-linkdb")) {
+ if (args[i].equals("-nocrawldb")) {
+ noCrawlDb = true;
+ } else if (args[i].equals("-linkdb")) {
linkDb = new Path(args[++i]);
} else if (args[i].equals("-dir")) {
dir = new Path(args[++i]);
@@ -218,7 +264,14 @@
base64 = true;
} else if (args[i].equals("-params")) {
params = args[++i];
+ } else if (crawlDb == null && !noCrawlDb) {
+ /*
+ * expect CrawlDb as first non-option argument unless -nocrawldb is
+ * given
+ */
+ crawlDb = new Path(args[i]);
} else {
+ // remaining arguments are segments
dir = new Path(args[i]);
fs = dir.getFileSystem(getConf());
if (SegmentChecker.isIndexable(dir,fs)) {
@@ -227,6 +280,12 @@
}
}
+ if (segments.size() == 0) {
+ usage();
+ System.err.println("No indexable segments passed as arguments. At least one segment is required!");
+ return -1;
+ }
+
try {
index(crawlDb, linkDb, segments, noCommit, deleteGone, params, filter, normalize, addBinaryContent, base64);
return 0;
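To complement the -nocrawldb sketch above, a hedged example of the classic invocation that still supplies a CrawlDb together with the optional LinkDb and plugin parameters described in usage(); every path and parameter value here is illustrative, not taken from the patch.

import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.indexer.IndexingJob;
import org.apache.nutch.util.NutchConfiguration;

public class IndexWithCrawlDb {               // hypothetical wrapper, for illustration only
  public static void main(String[] args) throws Exception {
    String[] indexerArgs = {
        "crawl/crawldb",                      // first non-option argument is taken as the CrawlDb
        "-linkdb", "crawl/linkdb",            // index anchor texts of incoming links
        "-params", "someKey=someValue",       // made-up parameter, passed via indexer.additional.params
        "crawl/segments/20240101000000",      // at least one segment is still mandatory
        "-noCommit"                           // skip the commit step of the indexer plugins
    };
    System.exit(ToolRunner.run(NutchConfiguration.create(), new IndexingJob(), indexerArgs));
  }
}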