/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.nutch.parse;

import java.lang.invoke.MethodHandles;
import java.util.HashMap;
import java.util.Map;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.StringUtil;
import org.apache.nutch.util.URLUtil;

/**
 * Parser checker, useful for testing parsers. It reports possible fetching and
 * parsing failures and presents protocol status signals to aid debugging. The
 * tool retrieves the following data for any URL:
* <ol>
* <li><tt>contentType</tt>: The URL {@link org.apache.nutch.protocol.Content}
* type.</li>
* <li><tt>signature</tt>: Digest is used to identify pages (like unique ID) and
* is used to remove duplicates during the dedup procedure. It is calculated
* using {@link org.apache.nutch.crawl.MD5Signature} or
* {@link org.apache.nutch.crawl.TextProfileSignature}.</li>
* <li><tt>Version</tt>: From {@link org.apache.nutch.parse.ParseData}.</li>
* <li><tt>Status</tt>: From {@link org.apache.nutch.parse.ParseData}.</li>
* <li><tt>Title</tt>: of the URL</li>
* <li><tt>Outlinks</tt>: associated with the URL</li>
* <li><tt>Content Metadata</tt>: such as <i>X-AspNet-Version</i>, <i>Date</i>,
* <i>Content-length</i>, <i>servedBy</i>, <i>Content-Type</i>,
 * <i>Cache-Control</i>, etc.</li>
* <li><tt>Parse Metadata</tt>: such as <i>CharEncodingForConversion</i>,
* <i>OriginalCharEncoding</i>, <i>language</i>, etc.</li>
 * <li><tt>ParseText</tt>: The page parse text, which varies in length
 * depending on the <code>content.length</code> configuration.</li>
* </ol>
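 *
 * <p>
 * For example, a typical invocation from the command line (assuming a standard
 * Nutch installation; the URL and metadata below are only illustrative) might
 * be:
 * </p>
 *
 * <pre>
 * bin/nutch parsechecker -dumpText -md myKey=myValue http://example.com/
 * </pre>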
*
* @author John Xing
*/
public class ParserChecker implements Tool {

  private static final Logger LOG = LoggerFactory
      .getLogger(MethodHandles.lookup().lookupClass());

  private Configuration conf;

  public ParserChecker() {
  }
  @Override
  public int run(String[] args) throws Exception {
    boolean dumpText = false;
    boolean force = false;
    String contentType = null;
    String url = null;

    String usage = "Usage: ParserChecker [-dumpText] [-forceAs mimeType] [-md key=value] url";

    if (args.length == 0) {
      LOG.error(usage);
      return -1;
    }

    // used to simulate the metadata propagated from injection
    HashMap<String, String> metadata = new HashMap<>();

    for (int i = 0; i < args.length; i++) {
      if (args[i].equals("-forceAs")) {
        force = true;
        contentType = args[++i];
      } else if (args[i].equals("-dumpText")) {
        dumpText = true;
      } else if (args[i].equals("-md")) {
        String k = null, v = null;
        String nextOne = args[++i];
        int firstEquals = nextOne.indexOf("=");
        if (firstEquals != -1) {
          k = nextOne.substring(0, firstEquals);
          v = nextOne.substring(firstEquals + 1);
        } else {
          k = nextOne;
        }
        metadata.put(k, v);
      } else if (i != args.length - 1) {
        LOG.error(usage);
        throw new IllegalArgumentException(usage);
      } else {
        url = URLUtil.toASCII(args[i]);
      }
    }

    if (LOG.isInfoEnabled()) {
      LOG.info("fetching: " + url);
    }
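
    // copy the simulated injection metadata onto a CrawlDatum so the protocol
    // and scoring plugins can see it during the check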
    CrawlDatum cd = new CrawlDatum();

    for (Map.Entry<String, String> entry : metadata.entrySet()) {
      String value = entry.getValue();
      if (value == null) {
        value = "";
      }
      cd.getMetaData().put(new Text(entry.getKey()), new Text(value));
    }
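
    // fetch the page with the protocol plugin that matches the URL scheme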
    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);
    Text turl = new Text(url);
    ProtocolOutput output = protocol.getProtocolOutput(turl, cd);

    // If the configuration permits, handle redirects until we either run
    // out of allowed redirects or we stop getting redirect statuses.
    int maxRedirects = conf.getInt("http.redirect.max", 0);
    int numRedirects = 0;

    while (output.getStatus().isRedirect() && numRedirects < maxRedirects) {
      String newURL = URLUtil.toASCII(output.getStatus().getArgs()[0]);
      LOG.info("Handling redirect to " + newURL);

      protocol = factory.getProtocol(newURL);
      turl = new Text(newURL);
      output = protocol.getProtocolOutput(turl, cd);

      numRedirects++;
    }
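
    // give up if the fetch did not succeed, reporting any redirects that were
    // left unhandled because of the http.redirect.max setting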
    if (!output.getStatus().isSuccess()) {
      System.err.println("Fetch failed with protocol status: "
          + output.getStatus());

      if (output.getStatus().isRedirect()) {
        System.err.println("Redirect(s) not handled due to configuration.");
        System.err.println("Max Redirects to handle per config: " + maxRedirects);
        System.err.println("Number of Redirects handled: " + numRedirects);
      }
      return -1;
    }
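
    // a fetch can report success yet carry no content; guard against that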
    Content content = output.getContent();

    if (content == null) {
      LOG.error("No content for " + url);
      return -1;
    }
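
    // honor a content type forced via -forceAs, otherwise use the type
    // detected by the protocol layer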
    if (force) {
      content.setContentType(contentType);
    } else {
      contentType = content.getContentType();
    }

    if (contentType == null) {
      LOG.error("Failed to determine content type!");
      return -1;
    }

    if (ParseSegment.isTruncated(content)) {
      LOG.warn("Content is truncated, parse may fail!");
    }

    ScoringFilters scfilters = new ScoringFilters(conf);
    // call the scoring filters
    try {
      scfilters.passScoreBeforeParsing(turl, cd, content);
    } catch (Exception e) {
      if (LOG.isWarnEnabled()) {
        LOG.warn("Couldn't pass score before parsing, url " + turl + " (" + e
            + ")");
        LOG.warn(StringUtils.stringifyException(e));
      }
    }
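
    // run the configured parser plugins on the content; the result may hold
    // multiple parses keyed by URL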
    ParseResult parseResult = new ParseUtil(conf).parse(content);

    if (parseResult == null) {
      LOG.error("Parsing content failed!");
      return -1;
    }

    // Calculate the signature
    byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
        content, parseResult.get(new Text(url)));

    if (LOG.isInfoEnabled()) {
      LOG.info("parsing: " + url);
      LOG.info("contentType: " + contentType);
      LOG.info("signature: " + StringUtil.toHexString(signature));
    }
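
    // pick the parse keyed by the fetched URL out of the parse result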
    Parse parse = parseResult.get(turl);

    if (parse == null) {
      LOG.error("Failed to get parse from parse result");
      LOG.error("Available parses in parse result (by URL key):");
      for (Map.Entry<Text, Parse> entry : parseResult) {
        LOG.error("  " + entry.getKey());
      }
      LOG.error("Parse result does not contain a parse for URL to be checked:");
      LOG.error("  " + turl);
      return -1;
    }

    // call the scoring filters
    try {
      scfilters.passScoreAfterParsing(turl, content, parse);
    } catch (Exception e) {
      if (LOG.isWarnEnabled()) {
        LOG.warn("Couldn't pass score after parsing, url " + turl + " (" + e
            + ")");
        LOG.warn(StringUtils.stringifyException(e));
      }
    }
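
    // print the URL, ParseData and (with -dumpText) ParseText of every parse
    // in the result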
    for (Map.Entry<Text, Parse> entry : parseResult) {
      parse = entry.getValue();
      LOG.info("---------\nUrl\n---------\n");
      System.out.print(entry.getKey());
      LOG.info("\n---------\nParseData\n---------\n");
      System.out.print(parse.getData().toString());
      if (dumpText) {
        LOG.info("---------\nParseText\n---------\n");
        System.out.print(parse.getText());
      }
    }

    return 0;
  }

  @Override
  public Configuration getConf() {
    return conf;
  }

  @Override
  public void setConf(Configuration c) {
    conf = c;
  }

  public static void main(String[] args) throws Exception {
    // propagate the tool's exit code to the shell instead of discarding it
    int res = ToolRunner.run(NutchConfiguration.create(), new ParserChecker(),
        args);
    System.exit(res);
  }

}