src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java - nutch - Git at Google

 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.nutch.protocol.file;

 import java.net.URL;
 import java.io.IOException;
 import java.io.UnsupportedEncodingException;

 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.util.MimeUtil;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.net.protocols.HttpDateFormat;
 import org.apache.nutch.net.protocols.Response;

 import org.apache.tika.Tika;

 import org.apache.hadoop.conf.Configuration;

 /************************************
  * FileResponse.java mimics file replies as http response. It tries its best to
  * follow http's way for headers, response codes as well as exceptions.
  *
  * Comments: (1) java.net.URL and java.net.URLConnection can handle file:
  * scheme. However they are not flexible enough, so not used in this
  * implementation.
  *
  * (2) java.io.File is used for its abstractness across platforms. Warning:
  * the java.io.File API (1.4.2) does not elaborate on how special files, such
  * as /dev/* on unix and /proc/* on linux, are treated. Tests show that (a)
  * java.io.File.isFile() returns false for /dev/*, (b) java.io.File.isFile()
  * returns true for /proc/*, and (c) java.io.File.length() returns 0 for
  * /proc/*. We are probably okay for now, but this could be buggy. How about
  * special files on windows?
  *
  * (3) java.io.File API (1.4.2) does not seem to know unix hard link files. They
  * are just treated as individual files.
  *
  * (4) No fancy POSIX file attributes yet. May never need?
  *
  * @author John Xing
  ***********************************/
 public class FileResponse {

   /** Body handed out by {@link #toContent()} when no content was captured. */
   private static final byte[] EMPTY_CONTENT = new byte[0];

   /** String form of the requested URL. */
   private final String orig;
   /** Base URL passed to the Content object; set to the same string as orig. */
   private final String base;
   /** Captured body; stays null for responses that carry no body. */
   private byte[] content;
   /** HTTP-style status code chosen for this response. */
   private int code;
   /** HTTP-style headers describing the response. */
   private final Metadata headers = new Metadata();

   /** Protocol object this response reads maxContentLength/crawlParents from. */
   private final File file;
   private final Configuration conf;

   /** Used to detect the MIME type of regular files. */
   private final Tika tika;

   /** Returns the response code. */
   public int getCode() {
     return code;
   }

   /** Returns the value of a named header. */
   public String getHeader(String name) {
     return headers.get(name);
   }

   /** Returns the captured body, or null if this response has no body. */
   public byte[] getContent() {
     return content;
   }

   /**
    * Wraps this response in a Content object, substituting an empty byte array
    * when no body was captured.
    */
   public Content toContent() {
     return new Content(orig, base, (content != null ? content : EMPTY_CONTENT),
         getHeader(Response.CONTENT_TYPE), headers, this.conf);
   }

   /**
    * Builds the response for a file: URL. The outcome is reported through the
    * response code: 404 if the path does not exist, 401 if it cannot be read,
    * 300 (with a Location header) if the path differs from its canonical form,
    * 304 (with a Last-Modified header) if the file is not newer than the
    * datum's modified time, 200 for a directory listing or file content, and
    * 500 for anything that is neither a directory nor a regular file.
    *
    * @param url
    *          the file: URL to fetch
    * @param datum
    *          crawl datum whose modified time is compared with the file's
    * @param file
    *          protocol object supplying maxContentLength and crawlParents
    * @param conf
    *          configuration passed on to the Content built by toContent()
    * @throws FileException
    *           if url does not use the file protocol, or the file is larger
    *           than Integer.MAX_VALUE bytes
    * @throws IOException
    *           if resolving the canonical path or reading the file fails
    */
   public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf)
       throws FileException, IOException {

     this.orig = url.toString();
     this.base = url.toString();
     this.file = file;
     this.conf = conf;
     this.tika = new Tika();

     if (!"file".equals(url.getProtocol())) {
       throw new FileException("Not a file url:" + url);
     }

     if (File.LOG.isTraceEnabled()) {
       File.LOG.trace("fetching " + url);
     }

     // Compare by value, not by reference: the two differ when the URL
     // carries something beyond the plain path (e.g. a query string).
     if (!url.getPath().equals(url.getFile())) {
       if (File.LOG.isWarnEnabled()) {
         File.LOG.warn("url.getPath() != url.getFile(): " + url);
       }
     }

     String path = "".equals(url.getPath()) ? "/" : url.getPath();

     try {
       // specify the encoding via the config later?
       path = java.net.URLDecoder.decode(path, "UTF-8");
     } catch (UnsupportedEncodingException ex) {
       // Not expected, since UTF-8 is a charset every JVM must provide. Stay
       // best-effort and continue with the undecoded path, but say so.
       if (File.LOG.isWarnEnabled()) {
         File.LOG.warn("UTF-8 unsupported, using undecoded path " + path
             + ": " + ex);
       }
     }

     this.content = null;

     java.io.File f = new java.io.File(path);

     if (!f.exists()) {
       this.code = 404; // http Not Found
       return;
     }

     if (!f.canRead()) {
       this.code = 401; // http Unauthorized
       return;
     }

     // symbolic link or relative path on unix
     // fix me: what's the consequence on windows platform
     // where case is insensitive
     java.io.File canonical = f.getCanonicalFile();
     if (!f.equals(canonical)) {
       // Going through toURI() escapes characters that are illegal in URLs.
       headers.set(Response.LOCATION, canonical.toURI().toURL().toString());
       this.code = 300; // http redirect
       return;
     }

     if (f.lastModified() <= datum.getModifiedTime()) {
       this.code = 304;
       this.headers.set("Last-Modified",
           HttpDateFormat.toString(f.lastModified()));
       return;
     }

     if (f.isDirectory()) {
       getDirAsHttpResponse(f);
     } else if (f.isFile()) {
       getFileAsHttpResponse(f);
     } else {
       this.code = 500; // http Internal Server Error
     }
   }

   /**
    * Reads a regular file into the content buffer and sets the Content-Length,
    * Last-Modified and Content-Type headers plus response code 200. At most
    * maxContentLength bytes are read when that limit is non-negative; the
    * Content-Length header still reports the full file size.
    *
    * @param f
    *          the regular file to read
    * @throws FileException
    *           if the file is larger than Integer.MAX_VALUE bytes
    * @throws IOException
    *           if reading the file or detecting its type fails
    */
   private void getFileAsHttpResponse(java.io.File f) throws FileException,
       IOException {

     // ignore file of size larger than
     // Integer.MAX_VALUE = 2^31-1 = 2147483647
     long size = f.length();
     if (size > Integer.MAX_VALUE) {
       throw new FileException("file is too large, size: " + size);
     }

     // capture content
     int len = (int) size;
     if (this.file.maxContentLength >= 0 && len > this.file.maxContentLength) {
       len = this.file.maxContentLength;
     }

     this.content = new byte[len];

     java.io.InputStream is = new java.io.FileInputStream(f);
     try {
       int offset = 0;
       int n;
       while (offset < len
           && (n = is.read(this.content, offset, len - offset)) >= 0) {
         offset += n;
       }
       if (offset < len) { // keep whatever already have, but issue a warning
         if (File.LOG.isWarnEnabled()) {
           File.LOG.warn("not enough bytes read from file: " + f.getPath());
         }
       }
     } finally {
       // release the file handle even when read() fails
       is.close();
     }

     // set headers
     headers.set(Response.CONTENT_LENGTH, Long.toString(size));
     headers.set(Response.LAST_MODIFIED,
         HttpDateFormat.toString(f.lastModified()));

     String mimeType = tika.detect(f);

     headers.set(Response.CONTENT_TYPE, mimeType != null ? mimeType : "");

     // response code
     this.code = 200; // http OK
   }

   /**
    * Renders a directory listing as an html page, stores it as the content and
    * sets the Content-Length, Content-Type and Last-Modified headers plus
    * response code 200. A link to the parent directory is included only when
    * crawlParents is set and the directory is not "/".
    *
    * @param f
    *          the directory to list
    * @throws IOException
    *           declared for callers; see the method body for what can throw
    */
   private void getDirAsHttpResponse(java.io.File f) throws IOException {

     String path = f.toString();
     boolean includeDotDot = this.file.crawlParents && !"/".equals(path);
     this.content = list2html(f.listFiles(), path, includeDotDot);

     // set headers
     headers.set(Response.CONTENT_LENGTH,
         Integer.toString(this.content.length));
     headers.set(Response.CONTENT_TYPE, "text/html");
     headers.set(Response.LAST_MODIFIED,
         HttpDateFormat.toString(f.lastModified()));

     // response code
     this.code = 200; // http OK
   }

   /**
    * Generates an html page from a directory listing. Directories and regular
    * files get one link line each; any other entry is skipped.
    *
    * @param list
    *          entries of the directory; may be null (java.io.File.listFiles()
    *          returns null when the directory cannot be listed), which yields
    *          a page without entries
    * @param path
    *          directory path shown in the title and heading
    * @param includeDotDot
    *          whether to emit a link to the parent directory
    * @return the page as bytes in the platform default charset
    */
   private byte[] list2html(java.io.File[] list, String path,
       boolean includeDotDot) {

     String shownPath = escapeHtml(path);

     StringBuilder x = new StringBuilder("<html><head>");
     x.append("<title>Index of ").append(shownPath).append("</title></head>\n");
     x.append("<body><h1>Index of ").append(shownPath).append("</h1><pre>\n");

     if (includeDotDot) {
       x.append("<a href='../'>../</a>\t-\t-\t-\n");
     }

     // fix me: we might want to sort list here! but not now.

     if (list != null) {
       for (java.io.File f : list) {
         // Escape the name so quotes, '<' or '&' in it cannot break the link.
         String name = escapeHtml(f.getName());
         String time = HttpDateFormat.toString(f.lastModified());
         if (f.isDirectory()) {
           x.append("<a href='").append(name).append("/'>").append(name)
               .append("/</a>\t");
           x.append(time).append("\t-\n");
         } else if (f.isFile()) {
           x.append("<a href='").append(name).append("'>").append(name)
               .append("</a>\t");
           x.append(time).append("\t").append(f.length()).append("\n");
         }
         // anything that is neither a directory nor a regular file is ignored
       }
     }

     x.append("</pre></body></html>\n");

     // NOTE(review): platform default charset is kept as-is; the page does
     // not declare an encoding, so switching charsets needs a wider change.
     return x.toString().getBytes();
   }

   /**
    * Replaces the characters that are significant in html text and quoted
    * attribute values with character references.
    *
    * @param s
    *          raw text
    * @return s with &amp;, &lt;, &gt;, double and single quotes escaped
    */
   private static String escapeHtml(String s) {
     StringBuilder out = new StringBuilder(s.length() + 16);
     for (int i = 0; i < s.length(); i++) {
       char c = s.charAt(i);
       switch (c) {
       case '&':
         out.append("&amp;");
         break;
       case '<':
         out.append("&lt;");
         break;
       case '>':
         out.append("&gt;");
         break;
       case '"':
         out.append("&quot;");
         break;
       case '\'':
         out.append("&#39;");
         break;
       default:
         out.append(c);
         break;
       }
     }
     return out.toString();
   }

 }
	/**
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.nutch.protocol.file;

	import java.net.URL;
	import java.io.IOException;
	import java.io.UnsupportedEncodingException;

	import org.apache.nutch.crawl.CrawlDatum;
	import org.apache.nutch.protocol.Content;
	import org.apache.nutch.util.MimeUtil;
	import org.apache.nutch.metadata.Metadata;
	import org.apache.nutch.net.protocols.HttpDateFormat;
	import org.apache.nutch.net.protocols.Response;

	import org.apache.tika.Tika;

	import org.apache.hadoop.conf.Configuration;

	/************************************
	* FileResponse.java mimics file replies as http response. It tries its best to
	* follow http's way for headers, response codes as well as exceptions.
	*
	* Comments: (1) java.net.URL and java.net.URLConnection can handle file:
	* scheme. However they are not flexible enough, so not used in this
	* implementation.
	*
	* (2) java.io.File is used for its abstractness across platforms. Warning:
	* the java.io.File API (1.4.2) does not elaborate on how special files, such
	* as /dev/* on unix and /proc/* on linux, are treated. Tests show that (a)
	* java.io.File.isFile() returns false for /dev/*, (b) java.io.File.isFile()
	* returns true for /proc/*, and (c) java.io.File.length() returns 0 for
	* /proc/*. We are probably okay for now, but this could be buggy. How about
	* special files on windows?
	*
	* (3) java.io.File API (1.4.2) does not seem to know unix hard link files. They
	* are just treated as individual files.
	*
	* (4) No fancy POSIX file attributes yet. May never need?
	*
	* @author John Xing
	***********************************/
	public class FileResponse {

	  /** Body handed out by {@link #toContent()} when no content was captured. */
	  private static final byte[] EMPTY_CONTENT = new byte[0];

	  /** String form of the requested URL. */
	  private final String orig;
	  /** Base URL passed to the Content object; set to the same string as orig. */
	  private final String base;
	  /** Captured body; stays null for responses that carry no body. */
	  private byte[] content;
	  /** HTTP-style status code chosen for this response. */
	  private int code;
	  /** HTTP-style headers describing the response. */
	  private final Metadata headers = new Metadata();

	  /** Protocol object this response reads maxContentLength/crawlParents from. */
	  private final File file;
	  private final Configuration conf;

	  /** Used to detect the MIME type of regular files. */
	  private final Tika tika;

	  /** Returns the response code. */
	  public int getCode() {
	    return code;
	  }

	  /** Returns the value of a named header. */
	  public String getHeader(String name) {
	    return headers.get(name);
	  }

	  /** Returns the captured body, or null if this response has no body. */
	  public byte[] getContent() {
	    return content;
	  }

	  /**
	   * Wraps this response in a Content object, substituting an empty byte array
	   * when no body was captured.
	   */
	  public Content toContent() {
	    return new Content(orig, base, (content != null ? content : EMPTY_CONTENT),
	        getHeader(Response.CONTENT_TYPE), headers, this.conf);
	  }

	  /**
	   * Builds the response for a file: URL. The outcome is reported through the
	   * response code: 404 if the path does not exist, 401 if it cannot be read,
	   * 300 (with a Location header) if the path differs from its canonical form,
	   * 304 (with a Last-Modified header) if the file is not newer than the
	   * datum's modified time, 200 for a directory listing or file content, and
	   * 500 for anything that is neither a directory nor a regular file.
	   *
	   * @param url
	   *          the file: URL to fetch
	   * @param datum
	   *          crawl datum whose modified time is compared with the file's
	   * @param file
	   *          protocol object supplying maxContentLength and crawlParents
	   * @param conf
	   *          configuration passed on to the Content built by toContent()
	   * @throws FileException
	   *           if url does not use the file protocol, or the file is larger
	   *           than Integer.MAX_VALUE bytes
	   * @throws IOException
	   *           if resolving the canonical path or reading the file fails
	   */
	  public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf)
	      throws FileException, IOException {

	    this.orig = url.toString();
	    this.base = url.toString();
	    this.file = file;
	    this.conf = conf;
	    this.tika = new Tika();

	    if (!"file".equals(url.getProtocol())) {
	      throw new FileException("Not a file url:" + url);
	    }

	    if (File.LOG.isTraceEnabled()) {
	      File.LOG.trace("fetching " + url);
	    }

	    // Compare by value, not by reference: the two differ when the URL
	    // carries something beyond the plain path (e.g. a query string).
	    if (!url.getPath().equals(url.getFile())) {
	      if (File.LOG.isWarnEnabled()) {
	        File.LOG.warn("url.getPath() != url.getFile(): " + url);
	      }
	    }

	    String path = "".equals(url.getPath()) ? "/" : url.getPath();

	    try {
	      // specify the encoding via the config later?
	      path = java.net.URLDecoder.decode(path, "UTF-8");
	    } catch (UnsupportedEncodingException ex) {
	      // Not expected, since UTF-8 is a charset every JVM must provide. Stay
	      // best-effort and continue with the undecoded path, but say so.
	      if (File.LOG.isWarnEnabled()) {
	        File.LOG.warn("UTF-8 unsupported, using undecoded path " + path
	            + ": " + ex);
	      }
	    }

	    this.content = null;

	    java.io.File f = new java.io.File(path);

	    if (!f.exists()) {
	      this.code = 404; // http Not Found
	      return;
	    }

	    if (!f.canRead()) {
	      this.code = 401; // http Unauthorized
	      return;
	    }

	    // symbolic link or relative path on unix
	    // fix me: what's the consequence on windows platform
	    // where case is insensitive
	    java.io.File canonical = f.getCanonicalFile();
	    if (!f.equals(canonical)) {
	      // Going through toURI() escapes characters that are illegal in URLs.
	      headers.set(Response.LOCATION, canonical.toURI().toURL().toString());
	      this.code = 300; // http redirect
	      return;
	    }

	    if (f.lastModified() <= datum.getModifiedTime()) {
	      this.code = 304;
	      this.headers.set("Last-Modified",
	          HttpDateFormat.toString(f.lastModified()));
	      return;
	    }

	    if (f.isDirectory()) {
	      getDirAsHttpResponse(f);
	    } else if (f.isFile()) {
	      getFileAsHttpResponse(f);
	    } else {
	      this.code = 500; // http Internal Server Error
	    }
	  }

	  /**
	   * Reads a regular file into the content buffer and sets the Content-Length,
	   * Last-Modified and Content-Type headers plus response code 200. At most
	   * maxContentLength bytes are read when that limit is non-negative; the
	   * Content-Length header still reports the full file size.
	   *
	   * @param f
	   *          the regular file to read
	   * @throws FileException
	   *           if the file is larger than Integer.MAX_VALUE bytes
	   * @throws IOException
	   *           if reading the file or detecting its type fails
	   */
	  private void getFileAsHttpResponse(java.io.File f) throws FileException,
	      IOException {

	    // ignore file of size larger than
	    // Integer.MAX_VALUE = 2^31-1 = 2147483647
	    long size = f.length();
	    if (size > Integer.MAX_VALUE) {
	      throw new FileException("file is too large, size: " + size);
	    }

	    // capture content
	    int len = (int) size;
	    if (this.file.maxContentLength >= 0 && len > this.file.maxContentLength) {
	      len = this.file.maxContentLength;
	    }

	    this.content = new byte[len];

	    java.io.InputStream is = new java.io.FileInputStream(f);
	    try {
	      int offset = 0;
	      int n;
	      while (offset < len
	          && (n = is.read(this.content, offset, len - offset)) >= 0) {
	        offset += n;
	      }
	      if (offset < len) { // keep whatever already have, but issue a warning
	        if (File.LOG.isWarnEnabled()) {
	          File.LOG.warn("not enough bytes read from file: " + f.getPath());
	        }
	      }
	    } finally {
	      // release the file handle even when read() fails
	      is.close();
	    }

	    // set headers
	    headers.set(Response.CONTENT_LENGTH, Long.toString(size));
	    headers.set(Response.LAST_MODIFIED,
	        HttpDateFormat.toString(f.lastModified()));

	    String mimeType = tika.detect(f);

	    headers.set(Response.CONTENT_TYPE, mimeType != null ? mimeType : "");

	    // response code
	    this.code = 200; // http OK
	  }

	  /**
	   * Renders a directory listing as an html page, stores it as the content and
	   * sets the Content-Length, Content-Type and Last-Modified headers plus
	   * response code 200. A link to the parent directory is included only when
	   * crawlParents is set and the directory is not "/".
	   *
	   * @param f
	   *          the directory to list
	   * @throws IOException
	   *           declared for callers; see the method body for what can throw
	   */
	  private void getDirAsHttpResponse(java.io.File f) throws IOException {

	    String path = f.toString();
	    boolean includeDotDot = this.file.crawlParents && !"/".equals(path);
	    this.content = list2html(f.listFiles(), path, includeDotDot);

	    // set headers
	    headers.set(Response.CONTENT_LENGTH,
	        Integer.toString(this.content.length));
	    headers.set(Response.CONTENT_TYPE, "text/html");
	    headers.set(Response.LAST_MODIFIED,
	        HttpDateFormat.toString(f.lastModified()));

	    // response code
	    this.code = 200; // http OK
	  }

	  /**
	   * Generates an html page from a directory listing. Directories and regular
	   * files get one link line each; any other entry is skipped.
	   *
	   * @param list
	   *          entries of the directory; may be null (java.io.File.listFiles()
	   *          returns null when the directory cannot be listed), which yields
	   *          a page without entries
	   * @param path
	   *          directory path shown in the title and heading
	   * @param includeDotDot
	   *          whether to emit a link to the parent directory
	   * @return the page as bytes in the platform default charset
	   */
	  private byte[] list2html(java.io.File[] list, String path,
	      boolean includeDotDot) {

	    String shownPath = escapeHtml(path);

	    StringBuilder x = new StringBuilder("<html><head>");
	    x.append("<title>Index of ").append(shownPath).append("</title></head>\n");
	    x.append("<body><h1>Index of ").append(shownPath).append("</h1><pre>\n");

	    if (includeDotDot) {
	      x.append("<a href='../'>../</a>\t-\t-\t-\n");
	    }

	    // fix me: we might want to sort list here! but not now.

	    if (list != null) {
	      for (java.io.File f : list) {
	        // Escape the name so quotes, '<' or '&' in it cannot break the link.
	        String name = escapeHtml(f.getName());
	        String time = HttpDateFormat.toString(f.lastModified());
	        if (f.isDirectory()) {
	          x.append("<a href='").append(name).append("/'>").append(name)
	              .append("/</a>\t");
	          x.append(time).append("\t-\n");
	        } else if (f.isFile()) {
	          x.append("<a href='").append(name).append("'>").append(name)
	              .append("</a>\t");
	          x.append(time).append("\t").append(f.length()).append("\n");
	        }
	        // anything that is neither a directory nor a regular file is ignored
	      }
	    }

	    x.append("</pre></body></html>\n");

	    // NOTE(review): platform default charset is kept as-is; the page does
	    // not declare an encoding, so switching charsets needs a wider change.
	    return x.toString().getBytes();
	  }

	  /**
	   * Replaces the characters that are significant in html text and quoted
	   * attribute values with character references.
	   *
	   * @param s
	   *          raw text
	   * @return s with &amp;, &lt;, &gt;, double and single quotes escaped
	   */
	  private static String escapeHtml(String s) {
	    StringBuilder out = new StringBuilder(s.length() + 16);
	    for (int i = 0; i < s.length(); i++) {
	      char c = s.charAt(i);
	      switch (c) {
	      case '&':
	        out.append("&amp;");
	        break;
	      case '<':
	        out.append("&lt;");
	        break;
	      case '>':
	        out.append("&gt;");
	        break;
	      case '"':
	        out.append("&quot;");
	        break;
	      case '\'':
	        out.append("&#39;");
	        break;
	      default:
	        out.append(c);
	        break;
	      }
	    }
	    return out.toString();
	  }

	}