blob: 05b9e2a6866f7c018c4cab1d3e1e875366d8408a [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.protocol.httpclient;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpVersion;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpClient;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.SpellCheckedMetadata;
import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.http.api.HttpBase;
import org.apache.hadoop.io.Text;
/**
 * An HTTP response obtained by executing a GET request through Commons
 * HttpClient.
 *
 * <p>
 * The request is executed in the constructor. Afterwards the instance exposes
 * the status code, the response headers and the (possibly truncated and
 * decompressed) response body through the {@link Response} accessors.
 * </p>
 *
 * @author Susam Pal
 */
public class HttpResponse implements Response {

  private final URL url;
  /** Response body; stays <code>null</code> if no body could be read. */
  private byte[] content;
  private int code;
  private final Metadata headers = new SpellCheckedMetadata();

  /**
   * Fetches the given <code>url</code> and prepares HTTP response.
   *
   * @param http
   *          An instance of the implementation class of this plugin
   * @param url
   *          URL to be fetched
   * @param datum
   *          Crawl data
   * @param followRedirects
   *          Whether to follow redirects; follows redirect if and only if this
   *          is true
   * @throws IOException
   *           When an error occurs; in particular when the status code is 200
   *           but the response body is missing or cannot be read
   */
  HttpResponse(Http http, URL url, CrawlDatum datum, boolean followRedirects)
      throws IOException {

    // Prepare GET method for HTTP request
    this.url = url;
    GetMethod get = new GetMethod(url.toString());
    get.setFollowRedirects(followRedirects);
    get.setDoAuthentication(true);
    if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
      get.setRequestHeader("If-Modified-Since",
          HttpDateFormat.toString(datum.getModifiedTime()));
    }

    // Set HTTP parameters
    HttpMethodParams params = get.getParams();
    if (http.getUseHttp11()) {
      params.setVersion(HttpVersion.HTTP_1_1);
    } else {
      params.setVersion(HttpVersion.HTTP_1_0);
    }
    params.makeLenient();
    params.setContentCharset("UTF-8");

    if (http.isCookieEnabled()) {
      params.setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);
      params.setBooleanParameter(HttpMethodParams.SINGLE_COOKIE_HEADER, true);
    } else {
      params.setCookiePolicy(CookiePolicy.IGNORE_COOKIES);
    }
    // XXX (ab) not sure about this... the default is to retry 3 times; if
    // XXX the request body was sent the method is not retried, so there is
    // XXX little danger in retrying...
    // params.setParameter(HttpMethodParams.RETRY_HANDLER, null);

    // Send the cookie stored in the crawl datum's metadata, if there is one
    if (http.isCookieEnabled() && datum.getMetaData().containsKey(http.COOKIE)) {
      String cookie = ((Text) datum.getMetaData().get(http.COOKIE)).toString();
      get.addRequestHeader("Cookie", cookie);
    }

    try {
      HttpClient client = Http.getClient();
      client.getParams().setParameter("http.useragent", http.getUserAgent()); // NUTCH-1941
      code = client.executeMethod(get);

      for (Header head : get.getResponseHeaders()) {
        headers.set(head.getName(), head.getValue());
      }

      // Limit download size: the smaller of the declared Content-Length and
      // the configured maximum (a negative maximum means "no limit")
      int contentLength = Integer.MAX_VALUE;
      String contentLengthString = headers.get(Response.CONTENT_LENGTH);
      if (contentLengthString != null) {
        try {
          contentLength = Integer.parseInt(contentLengthString.trim());
        } catch (NumberFormatException ex) {
          throw new HttpException("bad content length: " + contentLengthString);
        }
      }
      if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) {
        contentLength = http.getMaxContent();
      }

      // always read content. Sometimes content is useful to find a cause
      // for error.
      InputStream in = get.getResponseBodyAsStream();
      try {
        // The stream may be null when the response carries no body; in that
        // case the content is left unset instead of provoking an NPE.
        if (in != null) {
          content = readContent(in, contentLength);
        }
      } catch (Exception e) {
        if (code == 200) {
          // keep the original message but preserve the cause for diagnosis
          throw new IOException(e.toString(), e);
        }
        // for codes other than 200 OK, we are fine with empty content
      } finally {
        if (in != null) {
          in.close();
        }
        get.abort();
      }

      // A successful response without any body stream is still an error
      if (content == null && code == 200) {
        throw new IOException("no response body available for " + url);
      }

      StringBuilder fetchTrace = null;
      if (Http.LOG.isTraceEnabled()) {
        // Trace message; content may be null for non-200 responses
        fetchTrace = new StringBuilder("url: " + url + "; status code: " + code
            + "; bytes received: " + (content == null ? 0 : content.length));
        if (getHeader(Response.CONTENT_LENGTH) != null) {
          fetchTrace.append("; Content-Length: "
              + getHeader(Response.CONTENT_LENGTH));
        }
        if (getHeader(Response.LOCATION) != null) {
          fetchTrace.append("; Location: " + getHeader(Response.LOCATION));
        }
      }

      // Extract gzip, x-gzip and deflate content
      if (content != null) {
        // check if we have to uncompress it
        String contentEncoding = headers.get(Response.CONTENT_ENCODING);
        if (contentEncoding != null && fetchTrace != null) {
          fetchTrace.append("; Content-Encoding: " + contentEncoding);
        }
        if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) {
          content = http.processGzipEncoded(content, url);
          if (fetchTrace != null) {
            fetchTrace.append("; extracted to " + content.length + " bytes");
          }
        } else if ("deflate".equals(contentEncoding)) {
          content = http.processDeflateEncoded(content, url);
          if (fetchTrace != null) {
            fetchTrace.append("; extracted to " + content.length + " bytes");
          }
        }
      }

      // Logger trace message; guarded by the builder itself so that a log
      // level change during the request cannot cause a null dereference
      if (fetchTrace != null) {
        Http.LOG.trace(fetchTrace.toString());
      }
    } finally {
      get.releaseConnection();
    }
  }

  /**
   * Reads at most <code>limit</code> bytes from the given stream.
   *
   * @param in
   *          stream to read from; must not be <code>null</code>
   * @param limit
   *          maximum number of bytes to read; zero or a negative value yields
   *          an empty array
   * @return the bytes read, never <code>null</code>
   * @throws IOException
   *           if reading from the stream fails
   */
  private static byte[] readContent(InputStream in, int limit)
      throws IOException {
    byte[] buffer = new byte[HttpBase.BUFFER_SIZE];
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    int remaining = limit;
    int bytesRead;
    // Never request more than what is still allowed, so the result is cut
    // exactly at the limit instead of dropping the whole last chunk.
    while (remaining > 0
        && (bytesRead = in.read(buffer, 0,
            Math.min(buffer.length, remaining))) != -1) {
      out.write(buffer, 0, bytesRead);
      remaining -= bytesRead;
    }
    return out.toByteArray();
  }

  /*
   * ------------------------- * <implementation:Response> *
   * -------------------------
   */

  /** Returns the URL passed to the constructor. */
  public URL getUrl() {
    return url;
  }

  /** Returns the HTTP status code of the response. */
  public int getCode() {
    return code;
  }

  /** Returns the value of the response header with the given name. */
  public String getHeader(String name) {
    return headers.get(name);
  }

  /** Returns all response headers. */
  public Metadata getHeaders() {
    return headers;
  }

  /**
   * Returns the response body, or <code>null</code> if no body was read (which
   * can happen for status codes other than 200).
   */
  public byte[] getContent() {
    return content;
  }

  /*
   * -------------------------- * </implementation:Response> *
   * --------------------------
   */
}