src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java - nutch - Git at Google

 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.nutch.protocol.file;

 import java.lang.invoke.MethodHandles;
 import java.net.URL;
 import java.util.List;

 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.Text;

 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.net.protocols.Response;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolOutput;
 import org.apache.nutch.protocol.ProtocolStatus;
 import org.apache.nutch.protocol.RobotRulesParser;
 import org.apache.nutch.util.NutchConfiguration;

 import crawlercommons.robots.BaseRobotRules;

 /**
  * This class is a protocol plugin used for file: scheme. It creates
  * {@link FileResponse} object and gets the content of the url from it.
  * Configurable parameters are {@code file.content.limit},
  * {@code file.crawl.parent} and {@code file.crawl.redirect_noncanonical} (all
  * read in {@link #setConf(Configuration)}); see the "file properties" section
  * of nutch-default.xml.
  *
  * @author John Xing
  */
 public class File implements Protocol {

   protected static final Logger LOG = LoggerFactory
       .getLogger(MethodHandles.lookup().lookupClass());

   /**
    * Number of redirects followed internally (only when
    * {@link #symlinksAsRedirects} is false) before
    * {@link ProtocolStatus#REDIR_EXCEEDED} is returned.
    */
   static final int MAX_REDIRECTS = 5;

   private static final String USAGE = "Usage: File [-maxContentLength L] [-dumpContent] url";

   /** Length after which content is truncated ({@code file.content.limit}). */
   int maxContentLength;

   /** Value of {@code file.crawl.parent}; not read within this class. */
   boolean crawlParents;

   /**
    * if true return a redirect for symbolic links and do not resolve the links
    * internally
    */
   boolean symlinksAsRedirects = true;

   private Configuration conf;

   public File() {
   }

   /**
    * Set the {@link Configuration} object and read the plugin properties from
    * it: {@code file.content.limit} (default 64 kB), {@code file.crawl.parent}
    * (default true) and {@code file.crawl.redirect_noncanonical} (default
    * true).
    */
   @Override
   public void setConf(Configuration conf) {
     this.conf = conf;
     this.maxContentLength = conf.getInt("file.content.limit", 64 * 1024);
     this.crawlParents = conf.getBoolean("file.crawl.parent", true);
     this.symlinksAsRedirects = conf.getBoolean(
         "file.crawl.redirect_noncanonical", true);
   }

   /**
    * Get the {@link Configuration} object
    */
   @Override
   public Configuration getConf() {
     return this.conf;
   }

   /**
    * Set the length after which content is truncated. Overrides the value
    * read from the configuration.
    */
   public void setMaxContentLength(int maxContentLength) {
     this.maxContentLength = maxContentLength;
   }

   /**
    * Creates a {@link FileResponse} object corresponding to the url and return a
    * {@link ProtocolOutput} object as per the content received.
    *
    * <p>
    * Response codes 200, 304, 401 and 404 are mapped to the matching
    * {@link ProtocolStatus}. Any other 3xx code is a redirect: it is either
    * reported to the caller ({@link #symlinksAsRedirects} is true) or followed
    * internally up to {@link #MAX_REDIRECTS} times. Every other code, and any
    * exception raised on the way, yields a {@link ProtocolOutput} with
    * {@code null} content and a status wrapping the exception.
    *
    * @param url
    *          Text containing the url
    * @param datum
    *          The CrawlDatum object corresponding to the url
    *
    * @return {@link ProtocolOutput} object for the content of the file indicated
    *         by url
    */
   @Override
   public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
     String urlString = url.toString();
     try {
       URL u = new URL(urlString);

       int redirects = 0;

       while (true) {
         // make a request
         FileResponse response = new FileResponse(u, datum, this, getConf());

         int code = response.getCode();

         if (code == 200) { // got a good response
           return new ProtocolOutput(response.toContent());

         } else if (code == 304) { // got not modified
           return new ProtocolOutput(response.toContent(),
               ProtocolStatus.STATUS_NOTMODIFIED);

         } else if (code == 401) { // access denied / no read permissions
           return new ProtocolOutput(response.toContent(), new ProtocolStatus(
               ProtocolStatus.ACCESS_DENIED));

         } else if (code == 404) { // no such file
           return new ProtocolOutput(response.toContent(),
               ProtocolStatus.STATUS_NOTFOUND);

         } else if (code >= 300 && code < 400) { // handle redirect
           u = new URL(response.getHeader("Location"));
           LOG.trace("redirect to {}", u);
           if (symlinksAsRedirects) {
             // hand the redirect target back instead of resolving it here
             return new ProtocolOutput(response.toContent(), new ProtocolStatus(
                 ProtocolStatus.MOVED, u));
           } else if (redirects == MAX_REDIRECTS) {
             LOG.trace("Too many redirects: {}", url);
             return new ProtocolOutput(response.toContent(), new ProtocolStatus(
                 ProtocolStatus.REDIR_EXCEEDED, u));
           }
           redirects++;

         } else { // convert to exception
           throw new FileError(code);
         }
       }
     } catch (Exception e) {
       // report through the logger (with stack trace) rather than stderr
       LOG.warn("Failed to get protocol output for {}", urlString, e);
       return new ProtocolOutput(null, new ProtocolStatus(e));
     }
   }

   /** Prints the command line usage to stderr and terminates the JVM. */
   private static void exitWithUsage() {
     System.err.println(USAGE);
     System.exit(-1);
   }

   /**
    * Quick way for running this class. Useful for debugging.
    *
    * <p>
    * Arguments: {@code [-maxContentLength L] [-dumpContent] url}, the url must
    * be the last argument. Prints status and selected metadata to stderr and,
    * with {@code -dumpContent}, the content to stdout.
    */
   public static void main(String[] args) throws Exception {
     int maxContentLength = Integer.MIN_VALUE;
     boolean dumpContent = false;
     String urlString = null;

     if (args.length == 0) {
       exitWithUsage();
     }

     for (int i = 0; i < args.length; i++) {
       if (args[i].equals("-maxContentLength")) {
         if (i + 1 >= args.length) {
           // option given without its value
           exitWithUsage();
         }
         try {
           maxContentLength = Integer.parseInt(args[++i]);
         } catch (NumberFormatException e) {
           exitWithUsage();
         }
       } else if (args[i].equals("-dumpContent")) {
         dumpContent = true;
       } else if (i != args.length - 1) {
         exitWithUsage();
       } else {
         urlString = args[i];
       }
     }

     if (urlString == null) {
       // the last argument was an option, no url was given
       exitWithUsage();
     }

     File file = new File();
     file.setConf(NutchConfiguration.create());

     if (maxContentLength != Integer.MIN_VALUE) { // set maxContentLength
       file.setMaxContentLength(maxContentLength);
     }

     ProtocolOutput output = file.getProtocolOutput(new Text(urlString),
         new CrawlDatum());
     Content content = output.getContent();

     if (content == null) {
       // getProtocolOutput passes null content when it caught an exception;
       // only the status can be reported then
       System.err.println("Status: " + output.getStatus());
       System.exit(-1);
     }

     System.err.println("URL: " + content.getUrl());
     System.err.println("Status: " + output.getStatus());
     System.err.println("Content-Type: " + content.getContentType());
     System.err.println("Content-Length: "
         + content.getMetadata().get(Response.CONTENT_LENGTH));
     System.err.println("Last-Modified: "
         + content.getMetadata().get(Response.LAST_MODIFIED));
     String redirectLocation = content.getMetadata().get("Location");
     if (redirectLocation != null) {
       System.err.println("Location: " + redirectLocation);
     }

     if (dumpContent) {
       System.out.print(new String(content.getContent()));
     }
   }

   /**
    * No robots parsing is done for file protocol. So this returns a set of empty
    * rules which will allow every url.
    */
   @Override
   public BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
       List<Content> robotsTxtContent) {
     return RobotRulesParser.EMPTY_RULES;
   }

 }
	/**
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.nutch.protocol.file;

	import java.lang.invoke.MethodHandles;
	import java.net.URL;
	import java.util.List;

	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;

	import org.apache.hadoop.conf.Configuration;
	import org.apache.hadoop.io.Text;

	import org.apache.nutch.crawl.CrawlDatum;
	import org.apache.nutch.net.protocols.Response;
	import org.apache.nutch.protocol.Content;
	import org.apache.nutch.protocol.Protocol;
	import org.apache.nutch.protocol.ProtocolOutput;
	import org.apache.nutch.protocol.ProtocolStatus;
	import org.apache.nutch.protocol.RobotRulesParser;
	import org.apache.nutch.util.NutchConfiguration;

	import crawlercommons.robots.BaseRobotRules;

	/**
	 * This class is a protocol plugin used for file: scheme. It creates
	 * {@link FileResponse} object and gets the content of the url from it.
	 * Configurable parameters are {@code file.content.limit},
	 * {@code file.crawl.parent} and {@code file.crawl.redirect_noncanonical} (all
	 * read in {@link #setConf(Configuration)}); see the "file properties" section
	 * of nutch-default.xml.
	 *
	 * @author John Xing
	 */
	public class File implements Protocol {

		protected static final Logger LOG = LoggerFactory
				.getLogger(MethodHandles.lookup().lookupClass());

		/**
		 * Number of redirects followed internally (only when
		 * {@link #symlinksAsRedirects} is false) before
		 * {@link ProtocolStatus#REDIR_EXCEEDED} is returned.
		 */
		static final int MAX_REDIRECTS = 5;

		private static final String USAGE = "Usage: File [-maxContentLength L] [-dumpContent] url";

		/** Length after which content is truncated ({@code file.content.limit}). */
		int maxContentLength;

		/** Value of {@code file.crawl.parent}; not read within this class. */
		boolean crawlParents;

		/**
		 * if true return a redirect for symbolic links and do not resolve the links
		 * internally
		 */
		boolean symlinksAsRedirects = true;

		private Configuration conf;

		public File() {
		}

		/**
		 * Set the {@link Configuration} object and read the plugin properties from
		 * it: {@code file.content.limit} (default 64 kB), {@code file.crawl.parent}
		 * (default true) and {@code file.crawl.redirect_noncanonical} (default
		 * true).
		 */
		@Override
		public void setConf(Configuration conf) {
			this.conf = conf;
			this.maxContentLength = conf.getInt("file.content.limit", 64 * 1024);
			this.crawlParents = conf.getBoolean("file.crawl.parent", true);
			this.symlinksAsRedirects = conf.getBoolean(
					"file.crawl.redirect_noncanonical", true);
		}

		/**
		 * Get the {@link Configuration} object
		 */
		@Override
		public Configuration getConf() {
			return this.conf;
		}

		/**
		 * Set the length after which content is truncated. Overrides the value
		 * read from the configuration.
		 */
		public void setMaxContentLength(int maxContentLength) {
			this.maxContentLength = maxContentLength;
		}

		/**
		 * Creates a {@link FileResponse} object corresponding to the url and return a
		 * {@link ProtocolOutput} object as per the content received.
		 *
		 * <p>
		 * Response codes 200, 304, 401 and 404 are mapped to the matching
		 * {@link ProtocolStatus}. Any other 3xx code is a redirect: it is either
		 * reported to the caller ({@link #symlinksAsRedirects} is true) or followed
		 * internally up to {@link #MAX_REDIRECTS} times. Every other code, and any
		 * exception raised on the way, yields a {@link ProtocolOutput} with
		 * {@code null} content and a status wrapping the exception.
		 *
		 * @param url
		 *          Text containing the url
		 * @param datum
		 *          The CrawlDatum object corresponding to the url
		 *
		 * @return {@link ProtocolOutput} object for the content of the file indicated
		 *         by url
		 */
		@Override
		public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
			String urlString = url.toString();
			try {
				URL u = new URL(urlString);

				int redirects = 0;

				while (true) {
					// make a request
					FileResponse response = new FileResponse(u, datum, this, getConf());

					int code = response.getCode();

					if (code == 200) { // got a good response
						return new ProtocolOutput(response.toContent());

					} else if (code == 304) { // got not modified
						return new ProtocolOutput(response.toContent(),
								ProtocolStatus.STATUS_NOTMODIFIED);

					} else if (code == 401) { // access denied / no read permissions
						return new ProtocolOutput(response.toContent(), new ProtocolStatus(
								ProtocolStatus.ACCESS_DENIED));

					} else if (code == 404) { // no such file
						return new ProtocolOutput(response.toContent(),
								ProtocolStatus.STATUS_NOTFOUND);

					} else if (code >= 300 && code < 400) { // handle redirect
						u = new URL(response.getHeader("Location"));
						LOG.trace("redirect to {}", u);
						if (symlinksAsRedirects) {
							// hand the redirect target back instead of resolving it here
							return new ProtocolOutput(response.toContent(), new ProtocolStatus(
									ProtocolStatus.MOVED, u));
						} else if (redirects == MAX_REDIRECTS) {
							LOG.trace("Too many redirects: {}", url);
							return new ProtocolOutput(response.toContent(), new ProtocolStatus(
									ProtocolStatus.REDIR_EXCEEDED, u));
						}
						redirects++;

					} else { // convert to exception
						throw new FileError(code);
					}
				}
			} catch (Exception e) {
				// report through the logger (with stack trace) rather than stderr
				LOG.warn("Failed to get protocol output for {}", urlString, e);
				return new ProtocolOutput(null, new ProtocolStatus(e));
			}
		}

		/** Prints the command line usage to stderr and terminates the JVM. */
		private static void exitWithUsage() {
			System.err.println(USAGE);
			System.exit(-1);
		}

		/**
		 * Quick way for running this class. Useful for debugging.
		 *
		 * <p>
		 * Arguments: {@code [-maxContentLength L] [-dumpContent] url}, the url must
		 * be the last argument. Prints status and selected metadata to stderr and,
		 * with {@code -dumpContent}, the content to stdout.
		 */
		public static void main(String[] args) throws Exception {
			int maxContentLength = Integer.MIN_VALUE;
			boolean dumpContent = false;
			String urlString = null;

			if (args.length == 0) {
				exitWithUsage();
			}

			for (int i = 0; i < args.length; i++) {
				if (args[i].equals("-maxContentLength")) {
					if (i + 1 >= args.length) {
						// option given without its value
						exitWithUsage();
					}
					try {
						maxContentLength = Integer.parseInt(args[++i]);
					} catch (NumberFormatException e) {
						exitWithUsage();
					}
				} else if (args[i].equals("-dumpContent")) {
					dumpContent = true;
				} else if (i != args.length - 1) {
					exitWithUsage();
				} else {
					urlString = args[i];
				}
			}

			if (urlString == null) {
				// the last argument was an option, no url was given
				exitWithUsage();
			}

			File file = new File();
			file.setConf(NutchConfiguration.create());

			if (maxContentLength != Integer.MIN_VALUE) { // set maxContentLength
				file.setMaxContentLength(maxContentLength);
			}

			ProtocolOutput output = file.getProtocolOutput(new Text(urlString),
					new CrawlDatum());
			Content content = output.getContent();

			if (content == null) {
				// getProtocolOutput passes null content when it caught an exception;
				// only the status can be reported then
				System.err.println("Status: " + output.getStatus());
				System.exit(-1);
			}

			System.err.println("URL: " + content.getUrl());
			System.err.println("Status: " + output.getStatus());
			System.err.println("Content-Type: " + content.getContentType());
			System.err.println("Content-Length: "
					+ content.getMetadata().get(Response.CONTENT_LENGTH));
			System.err.println("Last-Modified: "
					+ content.getMetadata().get(Response.LAST_MODIFIED));
			String redirectLocation = content.getMetadata().get("Location");
			if (redirectLocation != null) {
				System.err.println("Location: " + redirectLocation);
			}

			if (dumpContent) {
				System.out.print(new String(content.getContent()));
			}
		}

		/**
		 * No robots parsing is done for file protocol. So this returns a set of empty
		 * rules which will allow every url.
		 */
		@Override
		public BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
				List<Content> robotsTxtContent) {
			return RobotRulesParser.EMPTY_RULES;
		}

	}