blob: 04512019ffce74f97784d76587f55ad64006a373 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.protocol.ftp;
import org.apache.commons.net.ftp.FTP;
import org.apache.commons.net.ftp.FTPFile;
import org.apache.commons.net.ftp.FTPReply;
import org.apache.commons.net.ftp.parser.DefaultFTPFileEntryParserFactory;
import org.apache.commons.net.ftp.parser.ParserInitializationException;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.net.protocols.Response;
import org.apache.hadoop.conf.Configuration;
import java.net.InetAddress;
import java.net.URL;
import java.util.List;
import java.util.LinkedList;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
/**
* FtpResponse.java mimics ftp replies as http response. It tries its best to
* follow http's way for headers, response codes as well as exceptions.
*
* Comments: In this class, all FtpException*.java thrown by Client.java and
* some important commons-net exceptions passed by Client.java must have been
* properly dealt with. They'd better not be leaked to the caller of this class.
*/
public class FtpResponse {
private String orig;
private String base;
private byte[] content;
private static final byte[] EMPTY_CONTENT = new byte[0];
private int code;
private Metadata headers = new Metadata();
private final Ftp ftp;
private Configuration conf;
/** Returns the response code. */
public int getCode() {
return code;
}
/** Returns the value of a named header. */
public String getHeader(String name) {
return headers.get(name);
}
public byte[] getContent() {
return content;
}
public Content toContent() {
return new Content(orig, base, (content != null ? content : EMPTY_CONTENT),
getHeader(Response.CONTENT_TYPE), headers, this.conf);
}
public FtpResponse(URL url, CrawlDatum datum, Ftp ftp, Configuration conf)
throws FtpException, IOException {
this.orig = url.toString();
this.base = url.toString();
this.ftp = ftp;
this.conf = conf;
if (!"ftp".equals(url.getProtocol()))
throw new FtpException("Not a ftp url:" + url);
if (url.getPath() != url.getFile()) {
if (Ftp.LOG.isWarnEnabled()) {
Ftp.LOG.warn("url.getPath() != url.getFile(): " + url);
}
}
String path = "".equals(url.getPath()) ? "/" : url.getPath();
try {
if (ftp.followTalk) {
if (Ftp.LOG.isInfoEnabled()) {
Ftp.LOG.info("fetching " + url);
}
} else {
if (Ftp.LOG.isTraceEnabled()) {
Ftp.LOG.trace("fetching " + url);
}
}
InetAddress addr = InetAddress.getByName(url.getHost());
if (addr != null && conf.getBoolean("store.ip.address", false) == true) {
headers.add("_ip_", addr.getHostAddress());
}
// idled too long, remote server or ourselves may have timed out,
// should start anew.
if (ftp.client != null && ftp.keepConnection
&& ftp.renewalTime < System.currentTimeMillis()) {
if (Ftp.LOG.isInfoEnabled()) {
Ftp.LOG.info("delete client because idled too long");
}
ftp.client = null;
}
// start anew if needed
if (ftp.client == null) {
if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
Ftp.LOG.info("start client");
}
// the real client
ftp.client = new Client();
// when to renew, take the lesser
// ftp.renewalTime = System.currentTimeMillis()
// + ((ftp.timeout<ftp.serverTimeout) ? ftp.timeout :
// ftp.serverTimeout);
// timeout for control connection
ftp.client.setDefaultTimeout(ftp.timeout);
// timeout for data connection
ftp.client.setDataTimeout(ftp.timeout);
// follow ftp talk?
if (ftp.followTalk)
ftp.client.addProtocolCommandListener(new PrintCommandListener(
Ftp.LOG));
}
// quit from previous site if at a different site now
if (ftp.client.isConnected()) {
InetAddress remoteAddress = ftp.client.getRemoteAddress();
if (!addr.equals(remoteAddress)) {
if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
Ftp.LOG.info("disconnect from " + remoteAddress
+ " before connect to " + addr);
}
// quit from current site
ftp.client.logout();
ftp.client.disconnect();
}
}
// connect to current site if needed
if (!ftp.client.isConnected()) {
if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
Ftp.LOG.info("connect to " + addr);
}
ftp.client.connect(addr);
if (!FTPReply.isPositiveCompletion(ftp.client.getReplyCode())) {
ftp.client.disconnect();
if (Ftp.LOG.isWarnEnabled()) {
Ftp.LOG.warn("ftp.client.connect() failed: " + addr + " "
+ ftp.client.getReplyString());
}
this.code = 500; // http Internal Server Error
return;
}
if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
Ftp.LOG.info("log into " + addr);
}
if (!ftp.client.login(ftp.userName, ftp.passWord)) {
// login failed.
// please note that some server may return 421 immediately
// after USER anonymous, thus ftp.client.login() won't return false,
// but throw exception, which then will be handled by caller
// (not dealt with here at all) .
ftp.client.disconnect();
if (Ftp.LOG.isWarnEnabled()) {
Ftp.LOG.warn("ftp.client.login() failed: " + addr);
}
this.code = 401; // http Unauthorized
return;
}
// insist on binary file type
if (!ftp.client.setFileType(FTP.BINARY_FILE_TYPE)) {
ftp.client.logout();
ftp.client.disconnect();
if (Ftp.LOG.isWarnEnabled()) {
Ftp.LOG.warn("ftp.client.setFileType() failed: " + addr);
}
this.code = 500; // http Internal Server Error
return;
}
if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
Ftp.LOG.info("set parser for " + addr);
}
// SYST is valid only after login
try {
ftp.parser = null;
String parserKey = ftp.client.getSystemName();
// some server reports as UNKNOWN Type: L8, but in fact UNIX Type: L8
if (parserKey.startsWith("UNKNOWN Type: L8"))
parserKey = "UNIX Type: L8";
ftp.parser = (new DefaultFTPFileEntryParserFactory())
.createFileEntryParser(parserKey);
} catch (FtpExceptionBadSystResponse e) {
if (Ftp.LOG.isWarnEnabled()) {
Ftp.LOG
.warn("ftp.client.getSystemName() failed: " + addr + " " + e);
}
ftp.parser = null;
} catch (ParserInitializationException e) {
// ParserInitializationException is RuntimeException defined in
// org.apache.commons.net.ftp.parser.ParserInitializationException
if (Ftp.LOG.isWarnEnabled()) {
Ftp.LOG.warn("createFileEntryParser() failed. " + addr + " " + e);
}
ftp.parser = null;
} finally {
if (ftp.parser == null) {
// do not log as severe, otherwise
// FetcherThread/RequestScheduler will abort
if (Ftp.LOG.isWarnEnabled()) {
Ftp.LOG.warn("ftp.parser is null: " + addr);
}
ftp.client.logout();
ftp.client.disconnect();
this.code = 500; // http Internal Server Error
return;
}
}
} else {
if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
Ftp.LOG.info("use existing connection");
}
}
this.content = null;
path = java.net.URLDecoder.decode(path, "UTF-8");
if (path.endsWith("/")) {
getDirAsHttpResponse(path, datum.getModifiedTime());
} else {
getFileAsHttpResponse(path, datum.getModifiedTime());
}
// reset next renewalTime, take the lesser
if (ftp.client != null && ftp.keepConnection) {
ftp.renewalTime = System.currentTimeMillis()
+ ((ftp.timeout < ftp.serverTimeout) ? ftp.timeout
: ftp.serverTimeout);
if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
Ftp.LOG.info("reset renewalTime to "
+ HttpDateFormat.toString(ftp.renewalTime));
}
}
// getDirAsHttpResponse() or getFileAsHttpResponse() above
// may have deleted ftp.client
if (ftp.client != null && !ftp.keepConnection) {
if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
Ftp.LOG.info("disconnect from " + addr);
}
ftp.client.logout();
ftp.client.disconnect();
}
} catch (Exception e) {
if (Ftp.LOG.isWarnEnabled()) {
Ftp.LOG.warn("Error: ", e);
}
// for any un-foreseen exception (run time exception or not),
// do ultimate clean and leave ftp.client for garbage collection
if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
Ftp.LOG.info("delete client due to exception");
}
ftp.client = null;
// or do explicit garbage collection?
// System.gc();
// can we be less dramatic, using the following instead?
// probably unnecessary for our practical purpose here
// try {
// ftp.client.logout();
// ftp.client.disconnect();
// }
throw new FtpException(e);
// throw e;
}
}
// get ftp file as http response
private void getFileAsHttpResponse(String path, long lastModified)
throws IOException {
ByteArrayOutputStream os = null;
List<FTPFile> list = null;
try {
// first get its possible attributes
list = new LinkedList<FTPFile>();
ftp.client.retrieveList(path, list, ftp.maxContentLength, ftp.parser);
FTPFile ftpFile = (FTPFile) list.get(0);
this.headers.set(Response.CONTENT_LENGTH,
Long.valueOf(ftpFile.getSize()).toString());
this.headers.set(Response.LAST_MODIFIED,
HttpDateFormat.toString(ftpFile.getTimestamp()));
// don't retrieve the file if not changed.
if (ftpFile.getTimestamp().getTimeInMillis() <= lastModified) {
code = 304;
return;
}
os = new ByteArrayOutputStream(ftp.getBufferSize());
ftp.client.retrieveFile(path, os, ftp.maxContentLength);
this.content = os.toByteArray();
// // approximate bytes sent and read
// if (this.httpAccounting != null) {
// this.httpAccounting.incrementBytesSent(path.length());
// this.httpAccounting.incrementBytesRead(this.content.length);
// }
this.code = 200; // http OK
} catch (FtpExceptionControlClosedByForcedDataClose e) {
// control connection is off, clean up
// ftp.client.disconnect();
if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
Ftp.LOG.info("delete client because server cut off control channel: "
+ e);
}
ftp.client = null;
// in case this FtpExceptionControlClosedByForcedDataClose is
// thrown by retrieveList() (not retrieveFile()) above,
if (os == null) { // indicating throwing by retrieveList()
// throw new FtpException("fail to get attibutes: "+path);
if (Ftp.LOG.isWarnEnabled()) {
Ftp.LOG
.warn("Please try larger maxContentLength for ftp.client.retrieveList(). "
+ e);
}
// in a way, this is our request fault
this.code = 400; // http Bad request
return;
}
FTPFile ftpFile = (FTPFile) list.get(0);
this.headers.set(Response.CONTENT_LENGTH,
Long.valueOf(ftpFile.getSize()).toString());
// this.headers.put("content-type", "text/html");
this.headers.set(Response.LAST_MODIFIED,
HttpDateFormat.toString(ftpFile.getTimestamp()));
this.content = os.toByteArray();
if (ftpFile.getTimestamp().getTimeInMillis() <= lastModified) {
code = 304;
return;
}
// // approximate bytes sent and read
// if (this.httpAccounting != null) {
// this.httpAccounting.incrementBytesSent(path.length());
// this.httpAccounting.incrementBytesRead(this.content.length);
// }
this.code = 200; // http OK
} catch (FtpExceptionCanNotHaveDataConnection e) {
if (FTPReply.isPositiveCompletion(ftp.client.cwd(path))) {
// it is not a file, but dir, so redirect as a dir
this.headers.set(Response.LOCATION, path + "/");
this.code = 300; // http redirect
// fixme, should we do ftp.client.cwd("/"), back to top dir?
} else {
// it is not a dir either
this.code = 404; // http Not Found
}
} catch (FtpExceptionUnknownForcedDataClose e) {
// Please note control channel is still live.
// in a way, this is our request fault
if (Ftp.LOG.isWarnEnabled()) {
Ftp.LOG.warn("Unrecognized reply after forced close of data channel. "
+ "If this is acceptable, please modify Client.java accordingly. "
+ e);
}
this.code = 400; // http Bad Request
}
}
// get ftp dir list as http response
private void getDirAsHttpResponse(String path, long lastModified)
throws IOException {
List<FTPFile> list = new LinkedList<FTPFile>();
try {
// change to that dir first
if (!FTPReply.isPositiveCompletion(ftp.client.cwd(path))) {
this.code = 404; // http Not Found
return;
}
// fixme, should we do ftp.client.cwd("/"), back to top dir?
ftp.client.retrieveList(null, list, ftp.maxContentLength, ftp.parser);
this.content = list2html(list, path, "/".equals(path) ? false : true);
this.headers.set(Response.CONTENT_LENGTH,
Integer.valueOf(this.content.length).toString());
this.headers.set(Response.CONTENT_TYPE, "text/html");
// this.headers.put("Last-Modified", null);
// // approximate bytes sent and read
// if (this.httpAccounting != null) {
// this.httpAccounting.incrementBytesSent(path.length());
// this.httpAccounting.incrementBytesRead(this.content.length);
// }
this.code = 200; // http OK
} catch (FtpExceptionControlClosedByForcedDataClose e) {
// control connection is off, clean up
// ftp.client.disconnect();
if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
Ftp.LOG.info("delete client because server cut off control channel: "
+ e);
}
ftp.client = null;
this.content = list2html(list, path, "/".equals(path) ? false : true);
this.headers.set(Response.CONTENT_LENGTH,
Integer.valueOf(this.content.length).toString());
this.headers.set(Response.CONTENT_TYPE, "text/html");
// this.headers.put("Last-Modified", null);
// // approximate bytes sent and read
// if (this.httpAccounting != null) {
// this.httpAccounting.incrementBytesSent(path.length());
// this.httpAccounting.incrementBytesRead(this.content.length);
// }
this.code = 200; // http OK
} catch (FtpExceptionUnknownForcedDataClose e) {
// Please note control channel is still live.
// in a way, this is our request fault
if (Ftp.LOG.isWarnEnabled()) {
Ftp.LOG.warn("Unrecognized reply after forced close of data channel. "
+ "If this is acceptable, please modify Client.java accordingly. "
+ e);
}
this.code = 400; // http Bad Request
} catch (FtpExceptionCanNotHaveDataConnection e) {
if (Ftp.LOG.isWarnEnabled()) {
Ftp.LOG.warn("" + e);
}
this.code = 500; // http Iternal Server Error
}
}
// generate html page from ftp dir list
private byte[] list2html(List<FTPFile> list, String path,
boolean includeDotDot) {
// StringBuffer x = new
// StringBuffer("<!doctype html public \"-//ietf//dtd html//en\"><html><head>");
StringBuffer x = new StringBuffer("<html><head>");
x.append("<title>Index of " + path + "</title></head>\n");
x.append("<body><h1>Index of " + path + "</h1><pre>\n");
if (includeDotDot) {
x.append("<a href='../'>../</a>\t-\t-\t-\n");
}
for (int i = 0; i < list.size(); i++) {
FTPFile f = (FTPFile) list.get(i);
String name = f.getName();
String time = HttpDateFormat.toString(f.getTimestamp());
if (f.isDirectory()) {
// some ftp server LIST "." and "..", we skip them here
if (name.equals(".") || name.equals(".."))
continue;
x.append("<a href='" + name + "/" + "'>" + name + "/</a>\t");
x.append(time + "\t-\n");
} else if (f.isFile()) {
x.append("<a href='" + name + "'>" + name + "</a>\t");
x.append(time + "\t" + f.getSize() + "\n");
} else {
// ignore isSymbolicLink()
// ignore isUnknown()
}
}
x.append("</pre></body></html>\n");
return new String(x).getBytes();
}
}