blob: b2db2289fc75bff162ef90b85b134dfc5c1a667b [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.protocol.file;
import java.net.URL;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.MimeUtil;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.net.protocols.Response;
import org.apache.tika.Tika;
import org.apache.hadoop.conf.Configuration;
/************************************
* FileResponse.java mimics file replies as http response. It tries its best to
* follow http's way for headers, response codes as well as exceptions.
*
* Comments: (1) java.net.URL and java.net.URLConnection can handle file:
* scheme. However they are not flexible enough, so not used in this
* implementation.
*
* (2) java.io.File is used for its abstractness across platforms. Warning:
* java.io.File API (1.4.2) does not elaborate on how special files, such as
* /dev/* in unix and /proc/* on linux, are treated. Tests show (a)
* java.io.File.isFile() returns false for /dev/* (b) java.io.File.isFile()
* returns true for /proc/* (c) java.io.File.length() returns 0 for /proc/*. We
* are probably okay for now. Could be buggy here. How about special files on
* windows?
*
* (3) java.io.File API (1.4.2) does not seem to know unix hard link files. They
* are just treated as individual files.
*
* (4) No fancy POSIX file attributes yet. May never need?
*
* @author John Xing
***********************************/
public class FileResponse {

  /** URL string the response was requested for. */
  private String orig;

  /** Base URL string handed to {@link Content}; same value as {@link #orig}. */
  private String base;

  /** Captured bytes of the file or directory listing; may stay null. */
  private byte[] content;

  /** Substitute for a null {@link #content} when building a {@link Content}. */
  private static final byte[] EMPTY_CONTENT = new byte[0];

  /** HTTP-like status code of this response. */
  private int code;

  /** HTTP-like response headers. */
  private Metadata headers = new Metadata();

  /** Owning protocol instance; read for maxContentLength and crawlParents. */
  private final File file;

  private Configuration conf;

  // Created in the constructor; not otherwise used inside this class.
  private MimeUtil MIME;

  /** Used to detect the MIME type of regular files. */
  private Tika tika;

  /** Returns the response code. */
  public int getCode() {
    return code;
  }

  /** Returns the value of a named header. */
  public String getHeader(String name) {
    return headers.get(name);
  }

  /**
   * Returns the captured content bytes.
   *
   * @return the internal content array, or null when nothing was captured
   */
  public byte[] getContent() {
    return content;
  }

  /**
   * Wraps this response into a {@link Content} object.
   *
   * @return a Content built from the original URL, the base URL, the captured
   *         bytes (an empty array when none were captured), the Content-Type
   *         header, all headers and the configuration
   */
  public Content toContent() {
    return new Content(orig, base, (content != null ? content : EMPTY_CONTENT),
        getHeader(Response.CONTENT_TYPE), headers, this.conf);
  }

  /**
   * Resolves a <code>file:</code> URL against the local file system and
   * records the outcome as an HTTP-like response. The resulting code is:
   * <ul>
   * <li>404 if the path does not exist,</li>
   * <li>401 if the path is not readable,</li>
   * <li>300 (with a Location header) if the path differs from its canonical
   * form,</li>
   * <li>304 if the last-modified time of the path is not later than
   * <code>datum.getModifiedTime()</code>,</li>
   * <li>200 for a directory (HTML listing) or a regular file (its bytes),</li>
   * <li>500 for anything that is neither a directory nor a regular file.</li>
   * </ul>
   *
   * @param url
   *          the URL to fetch; its protocol must be <code>file</code>
   * @param datum
   *          crawl datum whose modified time is compared with the file's
   * @param file
   *          the protocol instance supplying maxContentLength and crawlParents
   * @param conf
   *          configuration passed on to MimeUtil and to the Content
   * @throws FileException
   *           if the URL is not a file URL or the file is larger than
   *           Integer.MAX_VALUE bytes
   * @throws IOException
   *           if the file system cannot be accessed
   */
  public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf)
      throws FileException, IOException {

    this.orig = url.toString();
    this.base = url.toString();
    this.file = file;
    this.conf = conf;

    MIME = new MimeUtil(conf);
    tika = new Tika();

    if (!"file".equals(url.getProtocol())) {
      throw new FileException("Not a file url:" + url);
    }

    if (File.LOG.isTraceEnabled()) {
      File.LOG.trace("fetching " + url);
    }

    // Compare by value: getFile() is getPath() plus any "?query" part, so the
    // two differ exactly when the URL carries a query string. The previous
    // reference comparison (!=) relied on both calls returning the very same
    // String object.
    String urlPath = url.getPath();
    if (!urlPath.equals(url.getFile())) {
      if (File.LOG.isWarnEnabled()) {
        File.LOG.warn("url.getPath() != url.getFile(): " + url);
      }
    }

    String path = "".equals(urlPath) ? "/" : urlPath;

    try {
      // specify the encoding via the config later?
      // NOTE(review): URLDecoder follows form-encoding rules (e.g. '+' becomes
      // a space); confirm this is intended for file paths.
      path = java.net.URLDecoder.decode(path, "UTF-8");
    } catch (UnsupportedEncodingException ignored) {
      // Not expected for "UTF-8"; keep the undecoded path but leave a trace
      // instead of swallowing the exception silently.
      if (File.LOG.isWarnEnabled()) {
        File.LOG.warn("UTF-8 not supported, using undecoded path: " + path);
      }
    }

    this.content = null;

    java.io.File f = new java.io.File(path);

    if (!f.exists()) {
      this.code = 404; // http Not Found
      return;
    }

    if (!f.canRead()) {
      this.code = 401; // http Unauthorized
      return;
    }

    // symbolic link or relative path on unix
    // fix me: what's the consequence on windows platform
    // where case is insensitive
    java.io.File canonical = f.getCanonicalFile();
    if (!f.equals(canonical)) {
      // Go through File -> URI -> URL so that characters that are illegal in
      // URLs are escaped automatically.
      headers.set(Response.LOCATION, canonical.toURI().toURL().toString());
      this.code = 300; // http redirect
      return;
    }

    if (f.lastModified() <= datum.getModifiedTime()) {
      this.code = 304; // http Not Modified
      this.headers.set("Last-Modified",
          HttpDateFormat.toString(f.lastModified()));
      return;
    }

    if (f.isDirectory()) {
      getDirAsHttpResponse(f);
    } else if (f.isFile()) {
      getFileAsHttpResponse(f);
    } else {
      this.code = 500; // http Internal Server Error
    }
  }

  /**
   * Reads a regular file into {@link #content} and sets the headers and the
   * response code 200.
   *
   * @param f
   *          the regular file to read
   * @throws FileException
   *           if the file is larger than Integer.MAX_VALUE bytes
   * @throws IOException
   *           if the file cannot be opened or read
   */
  private void getFileAsHttpResponse(java.io.File f) throws FileException,
      IOException {

    // ignore file of size larger than
    // Integer.MAX_VALUE = 2^31-1 = 2147483647
    long size = f.length();
    if (size > Integer.MAX_VALUE) {
      throw new FileException("file is too large, size: " + size);
    }

    // capture content, truncated to maxContentLength when that limit is >= 0
    int len = (int) size;
    if (this.file.maxContentLength >= 0 && len > this.file.maxContentLength) {
      len = this.file.maxContentLength;
    }

    this.content = new byte[len];

    java.io.InputStream is = new java.io.FileInputStream(f);
    try {
      int offset = 0;
      int n = 0;
      while (offset < len
          && (n = is.read(this.content, offset, len - offset)) >= 0) {
        offset += n;
      }
      if (offset < len) { // keep whatever already have, but issue a warning
        if (File.LOG.isWarnEnabled()) {
          File.LOG.warn("not enough bytes read from file: " + f.getPath());
        }
      }
    } finally {
      // close even when read() throws, so the file handle is not leaked
      is.close();
    }

    // set headers; Content-Length reports the full file size, not the
    // possibly truncated number of captured bytes
    headers.set(Response.CONTENT_LENGTH, Long.valueOf(size).toString());
    headers.set(Response.LAST_MODIFIED,
        HttpDateFormat.toString(f.lastModified()));

    String mimeType = tika.detect(f);
    headers.set(Response.CONTENT_TYPE, mimeType != null ? mimeType : "");

    // response code
    this.code = 200; // http OK
  }

  /**
   * Renders a directory as an HTML listing into {@link #content} and sets the
   * headers and the response code 200. A link to the parent directory is
   * included only when crawlParents is enabled and the directory is not "/".
   *
   * @param f
   *          the directory to list
   * @throws IOException
   *           declared for callers; kept from the original signature
   */
  private void getDirAsHttpResponse(java.io.File f) throws IOException {

    String path = f.toString();
    boolean includeDotDot = this.file.crawlParents && !"/".equals(path);

    this.content = list2html(f.listFiles(), path, includeDotDot);

    // set headers
    headers.set(Response.CONTENT_LENGTH,
        Integer.valueOf(this.content.length).toString());
    headers.set(Response.CONTENT_TYPE, "text/html");
    headers.set(Response.LAST_MODIFIED,
        HttpDateFormat.toString(f.lastModified()));

    // response code
    this.code = 200; // http OK
  }

  /**
   * Generates an HTML page from a directory listing. Directories are linked
   * with a trailing slash, regular files with their size; other entries are
   * skipped. The path and the entry names are HTML-escaped.
   *
   * @param list
   *          the directory entries; null (as returned by
   *          java.io.File.listFiles() on an I/O error) is treated as empty
   * @param path
   *          the directory path shown in the title and heading
   * @param includeDotDot
   *          whether to add a link to the parent directory
   * @return the page as bytes in the platform default charset
   */
  private byte[] list2html(java.io.File[] list, String path,
      boolean includeDotDot) {

    String shownPath = escapeHtml(path);

    StringBuilder x = new StringBuilder("<html><head>");
    x.append("<title>Index of ").append(shownPath).append("</title></head>\n");
    x.append("<body><h1>Index of ").append(shownPath).append("</h1><pre>\n");

    if (includeDotDot) {
      x.append("<a href='../'>../</a>\t-\t-\t-\n");
    }

    // fix me: we might want to sort list here! but not now.
    if (list != null) {
      for (int i = 0; i < list.length; i++) {
        java.io.File entry = list[i];
        // NOTE(review): names are HTML-escaped only, not URL-encoded; names
        // containing '%', '?' or '#' may still yield links that do not
        // resolve back to the entry.
        String name = escapeHtml(entry.getName());
        String time = HttpDateFormat.toString(entry.lastModified());
        if (entry.isDirectory()) {
          // the directory itself and its parent are not part of the listing,
          // so "." and ".." need no special handling here
          x.append("<a href='").append(name).append("/'>").append(name)
              .append("/</a>\t");
          x.append(time).append("\t-\n");
        } else if (entry.isFile()) {
          x.append("<a href='").append(name).append("'>").append(name)
              .append("</a>\t");
          x.append(time).append("\t").append(entry.length()).append("\n");
        }
        // anything else (neither directory nor regular file) is ignored
      }
    }

    x.append("</pre></body></html>\n");

    // NOTE(review): uses the platform default charset, as before; consider
    // pinning an explicit charset together with the Content-Type header.
    return x.toString().getBytes();
  }

  /**
   * Escapes the characters that are significant in HTML text and in quoted
   * attribute values.
   *
   * @param s
   *          the raw text, must not be null
   * @return the text with &amp;, &lt;, &gt;, double and single quotes replaced
   *         by character references
   */
  private static String escapeHtml(String s) {
    StringBuilder out = new StringBuilder(s.length() + 16);
    for (int i = 0; i < s.length(); i++) {
      char c = s.charAt(i);
      switch (c) {
      case '&':
        out.append("&amp;");
        break;
      case '<':
        out.append("&lt;");
        break;
      case '>':
        out.append("&gt;");
        break;
      case '"':
        out.append("&quot;");
        break;
      case '\'':
        out.append("&#39;");
        break;
      default:
        out.append(c);
        break;
      }
    }
    return out.toString();
  }
}