src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java - nutch - Git at Google

 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.nutch.protocol.httpclient;

 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;

 import org.apache.commons.httpclient.Header;
 import org.apache.commons.httpclient.HttpVersion;
 import org.apache.commons.httpclient.cookie.CookiePolicy;
 import org.apache.commons.httpclient.methods.GetMethod;
 import org.apache.commons.httpclient.params.HttpMethodParams;
 import org.apache.commons.httpclient.HttpException;
 import org.apache.commons.httpclient.HttpClient;


 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.metadata.SpellCheckedMetadata;
 import org.apache.nutch.net.protocols.HttpDateFormat;
 import org.apache.nutch.net.protocols.Response;
 import org.apache.nutch.protocol.http.api.HttpBase;
 import org.apache.hadoop.io.Text;

 /**
  * An HTTP response obtained by executing a GET request through Commons
  * HttpClient.
  *
  * <p>
  * All the work happens in the constructor: the request is sent, the response
  * headers are copied into a {@link Metadata} instance, and the body is read
  * (capped by the advertised Content-Length and by
  * <code>http.getMaxContent()</code>) and uncompressed when its
  * Content-Encoding is <code>gzip</code>, <code>x-gzip</code> or
  * <code>deflate</code>.
  * </p>
  *
  * @author Susam Pal
  */
 public class HttpResponse implements Response {

   private URL url;
   private byte[] content;
   private int code;
   private Metadata headers = new SpellCheckedMetadata();

   /**
    * Fetches the given <code>url</code> and prepares HTTP response.
    *
    * <p>
    * The response body is always read, whatever the status code. A failure
    * while reading the body is reported only for status 200; for any other
    * status the failure is ignored and the content is left <code>null</code>.
    * </p>
    *
    * @param http
    *          An instance of the implementation class of this plugin
    * @param url
    *          URL to be fetched
    * @param datum
    *          Crawl data
    * @param followRedirects
    *          Whether to follow redirects; follows redirect if and only if this
    *          is true
    * @throws IOException
    *           When the request fails, when the Content-Length header is not a
    *           valid integer, or when the body of a 200 response cannot be
    *           read
    */
   HttpResponse(Http http, URL url, CrawlDatum datum, boolean followRedirects)
       throws IOException {

     // Prepare GET method for HTTP request
     this.url = url;
     GetMethod get = new GetMethod(url.toString());
     get.setFollowRedirects(followRedirects);
     get.setDoAuthentication(true);
     if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
       get.setRequestHeader("If-Modified-Since",
           HttpDateFormat.toString(datum.getModifiedTime()));
     }

     // Set HTTP parameters
     HttpMethodParams params = get.getParams();
     if (http.getUseHttp11()) {
       params.setVersion(HttpVersion.HTTP_1_1);
     } else {
       params.setVersion(HttpVersion.HTTP_1_0);
     }
     params.makeLenient();
     params.setContentCharset("UTF-8");

     if (http.isCookieEnabled()) {
       params.setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);
       params.setBooleanParameter(HttpMethodParams.SINGLE_COOKIE_HEADER, true);
     } else {
       params.setCookiePolicy(CookiePolicy.IGNORE_COOKIES);
     }
     // XXX (ab) not sure about this... the default is to retry 3 times; if
     // XXX the request body was sent the method is not retried, so there is
     // XXX little danger in retrying...
     // params.setParameter(HttpMethodParams.RETRY_HANDLER, null);

     // Send the cookie stored in the crawl datum, if there is one
     if (http.isCookieEnabled() && datum.getMetaData().containsKey(http.COOKIE)) {
       String cookie = ((Text) datum.getMetaData().get(http.COOKIE)).toString();
       get.addRequestHeader("Cookie", cookie);
     }

     try {
       HttpClient client = Http.getClient();
       client.getParams().setParameter("http.useragent", http.getUserAgent()); // NUTCH-1941
       code = client.executeMethod(get);

       Header[] heads = get.getResponseHeaders();

       for (int i = 0; i < heads.length; i++) {
         headers.set(heads[i].getName(), heads[i].getValue());
       }

       // Limit download size: the smaller of the advertised Content-Length
       // and the configured maximum (a negative maximum means "no limit")
       int contentLength = Integer.MAX_VALUE;
       String contentLengthString = headers.get(Response.CONTENT_LENGTH);
       if (contentLengthString != null) {
         try {
           contentLength = Integer.parseInt(contentLengthString.trim());
         } catch (NumberFormatException ex) {
           throw new HttpException("bad content length: " + contentLengthString);
         }
       }
       if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) {
         contentLength = http.getMaxContent();
       }

       // always read content. Sometimes content is useful to find a cause
       // for error.
       InputStream in = get.getResponseBodyAsStream();
       try {
         content = readBody(in, contentLength);
       } catch (Exception e) {
         if (code == 200) {
           // keep the original exception as the cause so the stack trace
           // is not lost
           throw new IOException(e.toString(), e);
         }
         // for codes other than 200 OK, we are fine with empty content
       } finally {
         if (in != null) {
           in.close();
         }
         get.abort();
       }

       StringBuilder fetchTrace = null;
       if (Http.LOG.isTraceEnabled()) {
         // Trace message; content is still null when reading the body of a
         // non-200 response failed, so guard the length lookup
         fetchTrace = new StringBuilder("url: " + url + "; status code: " + code
             + "; bytes received: " + (content == null ? 0 : content.length));
         if (getHeader(Response.CONTENT_LENGTH) != null) {
           fetchTrace.append("; Content-Length: "
               + getHeader(Response.CONTENT_LENGTH));
         }
         if (getHeader(Response.LOCATION) != null) {
           fetchTrace.append("; Location: " + getHeader(Response.LOCATION));
         }
       }
       // Extract gzip, x-gzip and deflate content
       if (content != null) {
         // check if we have to uncompress it
         String contentEncoding = headers.get(Response.CONTENT_ENCODING);
         if (contentEncoding != null && fetchTrace != null) {
           fetchTrace.append("; Content-Encoding: " + contentEncoding);
         }
         if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) {
           content = http.processGzipEncoded(content, url);
           if (fetchTrace != null) {
             fetchTrace.append("; extracted to " + content.length + " bytes");
           }
         } else if ("deflate".equals(contentEncoding)) {
           content = http.processDeflateEncoded(content, url);
           if (fetchTrace != null) {
             fetchTrace.append("; extracted to " + content.length + " bytes");
           }
         }
       }

       // Logger trace message
       if (fetchTrace != null) {
         Http.LOG.trace(fetchTrace.toString());
       }
     } finally {
       get.releaseConnection();
     }
   }

   /**
    * Reads the response body from <code>in</code>, stopping at end of stream
    * or after exactly <code>limit</code> bytes, whichever comes first.
    *
    * @param in
    *          Response body stream; may be <code>null</code>
    * @param limit
    *          Maximum number of bytes to read; zero or a negative value yields
    *          an empty array
    * @return The bytes read, never <code>null</code>
    * @throws IOException
    *           When <code>in</code> is <code>null</code> or reading fails
    */
   private static byte[] readBody(InputStream in, int limit) throws IOException {
     if (in == null) {
       // Reported as an exception so that the caller applies its usual
       // policy: an error for status 200, no content otherwise
       throw new IOException("no response body stream");
     }
     byte[] buffer = new byte[HttpBase.BUFFER_SIZE];
     ByteArrayOutputStream out = new ByteArrayOutputStream();
     int totalRead = 0;
     while (totalRead < limit) {
       // Never ask for more than what is left below the limit, so the chunk
       // that reaches the limit is kept instead of being dropped entirely
       int wanted = Math.min(buffer.length, limit - totalRead);
       int bufferFilled = in.read(buffer, 0, wanted);
       if (bufferFilled == -1) {
         break;
       }
       totalRead += bufferFilled;
       out.write(buffer, 0, bufferFilled);
     }
     return out.toByteArray();
   }

   /*
    * ------------------------- * <implementation:Response> *
    * -------------------------
    */

   /** Returns the URL that was fetched. */
   public URL getUrl() {
     return url;
   }

   /** Returns the HTTP status code of the response. */
   public int getCode() {
     return code;
   }

   /** Returns the value of the response header with the given name. */
   public String getHeader(String name) {
     return headers.get(name);
   }

   /** Returns all response headers. */
   public Metadata getHeaders() {
     return headers;
   }

   /**
    * Returns the (uncompressed) response body; <code>null</code> when the body
    * of a non-200 response could not be read.
    */
   public byte[] getContent() {
     return content;
   }

   /*
    * -------------------------- * </implementation:Response> *
    * --------------------------
    */
 }
	/**
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.nutch.protocol.httpclient;

	import java.io.ByteArrayOutputStream;
	import java.io.IOException;
	import java.io.InputStream;
	import java.net.URL;

	import org.apache.commons.httpclient.Header;
	import org.apache.commons.httpclient.HttpVersion;
	import org.apache.commons.httpclient.cookie.CookiePolicy;
	import org.apache.commons.httpclient.methods.GetMethod;
	import org.apache.commons.httpclient.params.HttpMethodParams;
	import org.apache.commons.httpclient.HttpException;
	import org.apache.commons.httpclient.HttpClient;


	import org.apache.nutch.crawl.CrawlDatum;
	import org.apache.nutch.metadata.Metadata;
	import org.apache.nutch.metadata.SpellCheckedMetadata;
	import org.apache.nutch.net.protocols.HttpDateFormat;
	import org.apache.nutch.net.protocols.Response;
	import org.apache.nutch.protocol.http.api.HttpBase;
	import org.apache.hadoop.io.Text;

	/**
	* An HTTP response.
	*
	* @author Susam Pal
	*/
	public class HttpResponse implements Response {

	private URL url;
	private byte[] content;
	private int code;
	private Metadata headers = new SpellCheckedMetadata();

	/**
	* Fetches the given <code>url</code> and prepares HTTP response.
	*
	* @param http
	* An instance of the implementation class of this plugin
	* @param url
	* URL to be fetched
	* @param datum
	* Crawl data
	* @param followRedirects
	* Whether to follow redirects; follows redirect if and only if this
	* is true
	* @return HTTP response
	* @throws IOException
	* When an error occurs
	*/
	HttpResponse(Http http, URL url, CrawlDatum datum, boolean followRedirects)
	throws IOException {

	// Prepare GET method for HTTP request
	this.url = url;
	GetMethod get = new GetMethod(url.toString());
	get.setFollowRedirects(followRedirects);
	get.setDoAuthentication(true);
	if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
	get.setRequestHeader("If-Modified-Since",
	HttpDateFormat.toString(datum.getModifiedTime()));
	}

	// Set HTTP parameters
	HttpMethodParams params = get.getParams();
	if (http.getUseHttp11()) {
	params.setVersion(HttpVersion.HTTP_1_1);
	} else {
	params.setVersion(HttpVersion.HTTP_1_0);
	}
	params.makeLenient();
	params.setContentCharset("UTF-8");

	if (http.isCookieEnabled()) {
	params.setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);
	params.setBooleanParameter(HttpMethodParams.SINGLE_COOKIE_HEADER, true);
	} else {
	params.setCookiePolicy(CookiePolicy.IGNORE_COOKIES);
	}
	// XXX (ab) not sure about this... the default is to retry 3 times; if
	// XXX the request body was sent the method is not retried, so there is
	// XXX little danger in retrying...
	// params.setParameter(HttpMethodParams.RETRY_HANDLER, null);

	if (http.isCookieEnabled() && datum.getMetaData().containsKey(http.COOKIE)) {
	String cookie = ((Text)datum.getMetaData().get(http.COOKIE)).toString();
	get.addRequestHeader("Cookie", cookie);
	}

	try {
	HttpClient client = Http.getClient();
	client.getParams().setParameter("http.useragent", http.getUserAgent()); // NUTCH-1941
	code = client.executeMethod(get);

	Header[] heads = get.getResponseHeaders();

	for (int i = 0; i < heads.length; i++) {
	headers.set(heads[i].getName(), heads[i].getValue());
	}

	// Limit download size
	int contentLength = Integer.MAX_VALUE;
	String contentLengthString = headers.get(Response.CONTENT_LENGTH);
	if (contentLengthString != null) {
	try {
	contentLength = Integer.parseInt(contentLengthString.trim());
	} catch (NumberFormatException ex) {
	throw new HttpException("bad content length: " + contentLengthString);
	}
	}
	if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) {
	contentLength = http.getMaxContent();
	}

	// always read content. Sometimes content is useful to find a cause
	// for error.
	InputStream in = get.getResponseBodyAsStream();
	try {
	byte[] buffer = new byte[HttpBase.BUFFER_SIZE];
	int bufferFilled = 0;
	int totalRead = 0;
	ByteArrayOutputStream out = new ByteArrayOutputStream();
	while ((bufferFilled = in.read(buffer, 0, buffer.length)) != -1
	&& totalRead + bufferFilled <= contentLength) {
	totalRead += bufferFilled;
	out.write(buffer, 0, bufferFilled);
	}

	content = out.toByteArray();
	} catch (Exception e) {
	if (code == 200)
	throw new IOException(e.toString());
	// for codes other than 200 OK, we are fine with empty content
	} finally {
	if (in != null) {
	in.close();
	}
	get.abort();
	}

	StringBuilder fetchTrace = null;
	if (Http.LOG.isTraceEnabled()) {
	// Trace message
	fetchTrace = new StringBuilder("url: " + url + "; status code: " + code
	+ "; bytes received: " + content.length);
	if (getHeader(Response.CONTENT_LENGTH) != null)
	fetchTrace.append("; Content-Length: "
	+ getHeader(Response.CONTENT_LENGTH));
	if (getHeader(Response.LOCATION) != null)
	fetchTrace.append("; Location: " + getHeader(Response.LOCATION));
	}
	// Extract gzip, x-gzip and deflate content
	if (content != null) {
	// check if we have to uncompress it
	String contentEncoding = headers.get(Response.CONTENT_ENCODING);
	if (contentEncoding != null && Http.LOG.isTraceEnabled())
	fetchTrace.append("; Content-Encoding: " + contentEncoding);
	if ("gzip".equals(contentEncoding) \|\| "x-gzip".equals(contentEncoding)) {
	content = http.processGzipEncoded(content, url);
	if (Http.LOG.isTraceEnabled())
	fetchTrace.append("; extracted to " + content.length + " bytes");
	} else if ("deflate".equals(contentEncoding)) {
	content = http.processDeflateEncoded(content, url);
	if (Http.LOG.isTraceEnabled())
	fetchTrace.append("; extracted to " + content.length + " bytes");
	}
	}

	// Logger trace message
	if (Http.LOG.isTraceEnabled()) {
	Http.LOG.trace(fetchTrace.toString());
	}
	} finally {
	get.releaseConnection();
	}
	}

	/*
	* ------------------------- * <implementation:Response> *
	* -------------------------
	*/

	public URL getUrl() {
	return url;
	}

	public int getCode() {
	return code;
	}

	public String getHeader(String name) {
	return headers.get(name);
	}

	public Metadata getHeaders() {
	return headers;
	}

	public byte[] getContent() {
	return content;
	}

	/*
	* -------------------------- * </implementation:Response> *
	* --------------------------
	*/
	}