| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.nutch.protocol.http; |
| |
| import java.io.BufferedInputStream; |
| import java.io.ByteArrayOutputStream; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.OutputStream; |
| import java.io.PushbackInputStream; |
| import java.net.InetSocketAddress; |
| import java.net.Socket; |
| import java.net.URL; |
| import java.nio.charset.StandardCharsets; |
| import java.util.Arrays; |
| import java.util.HashSet; |
| import java.util.Set; |
| |
| import javax.net.ssl.SSLContext; |
| import javax.net.ssl.SSLSocket; |
| import javax.net.ssl.SSLSocketFactory; |
| import javax.net.ssl.TrustManager; |
| |
| import org.apache.hadoop.io.Text; |
| import org.apache.nutch.crawl.CrawlDatum; |
| import org.apache.nutch.metadata.HttpHeaders; |
| import org.apache.nutch.metadata.Metadata; |
| import org.apache.nutch.metadata.SpellCheckedMetadata; |
| import org.apache.nutch.net.protocols.HttpDateFormat; |
| import org.apache.nutch.net.protocols.Response; |
| import org.apache.nutch.protocol.ProtocolException; |
| import org.apache.nutch.protocol.http.api.HttpBase; |
| import org.apache.nutch.protocol.http.api.HttpException; |
| |
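/**
 * An HTTP response. The constructor performs the whole fetch: it opens a
 * socket (wrapped in TLS for https URLs), sends a GET request and parses the
 * status line, the headers and the content, which are then available through
 * the {@link Response} accessors.
 */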
| public class HttpResponse implements Response { |
| |
| private HttpBase http; |
| private URL url; |
| private byte[] content; |
| private int code; |
| private Metadata headers = new SpellCheckedMetadata(); |
| // used for storing the http headers verbatim |
| private StringBuffer httpHeaders; |
| |
| protected enum Scheme { |
| HTTP, HTTPS, |
| } |
| |
| /** |
| * Default public constructor. |
| * |
| * @param http |
| * @param url |
| * @param datum |
| * @throws ProtocolException |
| * @throws IOException |
| */ |
| public HttpResponse(HttpBase http, URL url, CrawlDatum datum) |
| throws ProtocolException, IOException { |
| |
| this.http = http; |
| this.url = url; |
| |
| Scheme scheme = null; |
| |
| if ("http".equals(url.getProtocol())) { |
| scheme = Scheme.HTTP; |
| } else if ("https".equals(url.getProtocol())) { |
| scheme = Scheme.HTTPS; |
| } else { |
| throw new HttpException("Unknown scheme (not http/https) for url:" + url); |
| } |
| |
| if (Http.LOG.isTraceEnabled()) { |
| Http.LOG.trace("fetching " + url); |
| } |
| |
| String path = url.getFile(); |
| if (!path.startsWith("/")) { |
| path = "/" + path; |
| } |
| |
| // some servers will redirect a request with a host line like |
| // "Host: <hostname>:80" to "http://<hpstname>/<orig_path>"- they |
| // don't want the :80... |
| |
| String host = url.getHost(); |
| int port; |
| String portString; |
| if (url.getPort() == -1) { |
| if (scheme == Scheme.HTTP) { |
| port = 80; |
| } else { |
| port = 443; |
| } |
| portString = ""; |
| } else { |
| port = url.getPort(); |
| portString = ":" + port; |
| } |
| Socket socket = null; |
| |
| try { |
| socket = new Socket(); // create the socket |
| socket.setSoTimeout(http.getTimeout()); |
| |
| // connect |
| String sockHost = http.useProxy(url) ? http.getProxyHost() : host; |
| int sockPort = http.useProxy(url) ? http.getProxyPort() : port; |
| InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort); |
| socket.connect(sockAddr, http.getTimeout()); |
| |
| if (scheme == Scheme.HTTPS) { |
| SSLSocket sslsocket = null; |
| |
| try { |
| sslsocket = getSSLSocket(socket, sockHost, sockPort); |
| sslsocket.startHandshake(); |
| } catch (Exception e) { |
| Http.LOG.debug("SSL connection to {} failed with: {}", url, |
| e.getMessage()); |
| if ("handshake alert: unrecognized_name".equals(e.getMessage())) { |
| try { |
| // Reconnect, see NUTCH-2447 |
| socket = new Socket(); |
| socket.setSoTimeout(http.getTimeout()); |
| socket.connect(sockAddr, http.getTimeout()); |
| sslsocket = getSSLSocket(socket, "", sockPort); |
| sslsocket.startHandshake(); |
| } catch (Exception ex) { |
| String msg = "SSL reconnect to " + url + " failed with: " |
| + e.getMessage(); |
| throw new HttpException(msg); |
| } |
| } |
| } |
| socket = sslsocket; |
| } |
| |
| if (http.isStoreIPAddress()) { |
| headers.add("_ip_", sockAddr.getAddress().getHostAddress()); |
| } |
| |
| // make request |
| OutputStream req = socket.getOutputStream(); |
| |
| StringBuffer reqStr = new StringBuffer("GET "); |
| if (http.useProxy(url)) { |
| reqStr.append(url.getProtocol() + "://" + host + portString + path); |
| } else { |
| reqStr.append(path); |
| } |
| |
| if (http.getUseHttp11()) { |
| reqStr.append(" HTTP/1.1\r\n"); |
| } else { |
| reqStr.append(" HTTP/1.0\r\n"); |
| } |
| |
| reqStr.append("Host: "); |
| reqStr.append(host); |
| reqStr.append(portString); |
| reqStr.append("\r\n"); |
| |
| reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n"); |
| |
| String userAgent = http.getUserAgent(); |
| if ((userAgent == null) || (userAgent.length() == 0)) { |
| if (Http.LOG.isErrorEnabled()) { |
| Http.LOG.error("User-agent is not set!"); |
| } |
| } else { |
| reqStr.append("User-Agent: "); |
| reqStr.append(userAgent); |
| reqStr.append("\r\n"); |
| } |
| |
| String acceptLanguage = http.getAcceptLanguage(); |
| if (!acceptLanguage.isEmpty()) { |
| reqStr.append("Accept-Language: "); |
| reqStr.append(acceptLanguage); |
| reqStr.append("\r\n"); |
| } |
| |
| String acceptCharset = http.getAcceptCharset(); |
| if (!acceptCharset.isEmpty()) { |
| reqStr.append("Accept-Charset: "); |
| reqStr.append(acceptCharset); |
| reqStr.append("\r\n"); |
| } |
| |
| String accept = http.getAccept(); |
| if (!accept.isEmpty()) { |
| reqStr.append("Accept: "); |
| reqStr.append(accept); |
| reqStr.append("\r\n"); |
| } |
| |
| if (http.isCookieEnabled() |
| && datum.getMetaData().containsKey(HttpBase.COOKIE)) { |
| String cookie = ((Text) datum.getMetaData().get(HttpBase.COOKIE)) |
| .toString(); |
| reqStr.append("Cookie: "); |
| reqStr.append(cookie); |
| reqStr.append("\r\n"); |
| } |
| |
| if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) { |
| reqStr.append(HttpHeaders.IF_MODIFIED_SINCE + ": " |
| + HttpDateFormat.toString(datum.getModifiedTime())); |
| reqStr.append("\r\n"); |
| } |
| |
| // "signal that this connection will be closed after completion of the |
| // response", see https://tools.ietf.org/html/rfc7230#section-6.1 |
| reqStr.append("Connection: close\r\n"); |
| reqStr.append("\r\n"); |
| |
| // store the request in the metadata? |
| if (http.isStoreHttpRequest()) { |
| headers.add(Response.REQUEST, reqStr.toString()); |
| } |
| |
| byte[] reqBytes = reqStr.toString().getBytes(); |
| |
| req.write(reqBytes); |
| req.flush(); |
| |
| PushbackInputStream in = // process response |
| new PushbackInputStream( |
| new BufferedInputStream(socket.getInputStream(), |
| Http.BUFFER_SIZE), Http.BUFFER_SIZE); |
| |
| StringBuffer line = new StringBuffer(); |
| StringBuffer lineSeparator = new StringBuffer(); |
| |
| // store the http headers verbatim |
| if (http.isStoreHttpHeaders()) { |
| httpHeaders = new StringBuffer(); |
| } |
| |
| headers.add(FETCH_TIME, Long.toString(System.currentTimeMillis())); |
| |
| boolean haveSeenNonContinueStatus = false; |
| while (!haveSeenNonContinueStatus) { |
| // parse status code line |
| try { |
| this.code = parseStatusLine(in, line, lineSeparator); |
| } catch(HttpException e) { |
| Http.LOG.warn("Missing or invalid HTTP status line", e); |
| Http.LOG.warn("No HTTP header, assuming HTTP/0.9 for {}", getUrl()); |
| this.code = 200; |
| in.unread(lineSeparator.toString().getBytes(StandardCharsets.ISO_8859_1)); |
| in.unread(line.toString().getBytes(StandardCharsets.ISO_8859_1)); |
| break; |
| } |
| if (httpHeaders != null) |
| httpHeaders.append(line).append("\r\n"); |
| // parse headers |
| parseHeaders(in, line, httpHeaders); |
| haveSeenNonContinueStatus = code != 100; // 100 is "Continue" |
| } |
| |
| try { |
| String transferEncoding = getHeader(Response.TRANSFER_ENCODING); |
| if (transferEncoding != null |
| && "chunked".equalsIgnoreCase(transferEncoding.trim())) { |
| readChunkedContent(in, line); |
| } else { |
| readPlainContent(in); |
| } |
| |
| String contentEncoding = getHeader(Response.CONTENT_ENCODING); |
| if ("gzip".equals(contentEncoding) |
| || "x-gzip".equals(contentEncoding)) { |
| content = http.processGzipEncoded(content, url); |
| } else if ("deflate".equals(contentEncoding)) { |
| content = http.processDeflateEncoded(content, url); |
| } else { |
| // store the headers verbatim only if the response was not compressed |
| // as the content length reported does not match otherwise |
| if (httpHeaders != null) { |
| httpHeaders.append("\r\n"); |
| headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString()); |
| } |
| if (Http.LOG.isTraceEnabled()) { |
| Http.LOG.trace("fetched " + content.length + " bytes from " + url); |
| } |
| } |
| } catch (IOException | HttpException e) { |
| // Headers parsing went fine, but an error occurred while trying to read |
| // the body of the request (the body may be malformed) |
| if (code != 200) { |
| Http.LOG.warn( |
| "Ignored exception while reading payload of response with status code " |
| + code + ":", |
| e); |
| content = null; |
| } else { |
| // If the page is a "200 OK" response, we do not want to go further |
| // with processing the invalid payload. |
| throw e; |
| } |
| } |
| } finally { |
| if (socket != null) |
| socket.close(); |
| } |
| |
| } |
| |
| /* |
| * ------------------------- * <implementation:Response> * |
| * ------------------------- |
| */ |
| |
| public URL getUrl() { |
| return url; |
| } |
| |
| public int getCode() { |
| return code; |
| } |
| |
| public String getHeader(String name) { |
| return headers.get(name); |
| } |
| |
| public Metadata getHeaders() { |
| return headers; |
| } |
| |
| public byte[] getContent() { |
| return content; |
| } |
| |
/*
 * ------------------------- * </implementation:Response> *
 * -------------------------
 */
| |
| private SSLSocket getSSLSocket(Socket socket, String sockHost, int sockPort) |
| throws Exception { |
| SSLSocketFactory factory; |
| if (http.isTlsCheckCertificates()) { |
| factory = (SSLSocketFactory) SSLSocketFactory.getDefault(); |
| } else { |
| SSLContext sslContext = SSLContext.getInstance("TLS"); |
| sslContext.init(null, |
| new TrustManager[] { new DummyX509TrustManager(null) }, null); |
| factory = sslContext.getSocketFactory(); |
| } |
| |
| SSLSocket sslsocket = (SSLSocket) factory |
| .createSocket(socket, sockHost, sockPort, true); |
| sslsocket.setUseClientMode(true); |
| |
| // Get the protocols and ciphers supported by this JVM |
| Set<String> protocols = new HashSet<String>( |
| Arrays.asList(sslsocket.getSupportedProtocols())); |
| Set<String> ciphers = new HashSet<String>( |
| Arrays.asList(sslsocket.getSupportedCipherSuites())); |
| |
| // Intersect with preferred protocols and ciphers |
| protocols.retainAll(http.getTlsPreferredProtocols()); |
| ciphers.retainAll(http.getTlsPreferredCipherSuites()); |
| |
| sslsocket.setEnabledProtocols( |
| protocols.toArray(new String[protocols.size()])); |
| sslsocket.setEnabledCipherSuites( |
| ciphers.toArray(new String[ciphers.size()])); |
| |
| return sslsocket; |
| } |
| |
| private void readPlainContent(InputStream in) |
| throws HttpException, IOException { |
| |
| int contentLength = Integer.MAX_VALUE; // get content length |
| String contentLengthString = headers.get(Response.CONTENT_LENGTH); |
| if (contentLengthString != null) { |
| contentLengthString = contentLengthString.trim(); |
| try { |
| if (!contentLengthString.isEmpty()) { |
| contentLength = Integer.parseInt(contentLengthString); |
| } |
| } catch (NumberFormatException e) { |
| Http.LOG.warn("bad content length: {}", contentLengthString); |
| } |
| } |
| if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) { |
| // limit the download size |
| contentLength = http.getMaxContent(); |
| } |
| |
| ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE); |
| byte[] bytes = new byte[Http.BUFFER_SIZE]; |
| int length = 0; |
| |
| // do not try to read if the contentLength is 0 |
| if (contentLength == 0) { |
| content = new byte[0]; |
| return; |
| } |
| |
| // read content |
| int i = in.read(bytes); |
| while (i != -1) { |
| out.write(bytes, 0, i); |
| length += i; |
| if (length >= contentLength) { |
| break; |
| } |
| if ((length + Http.BUFFER_SIZE) > contentLength) { |
| // reading next chunk may hit contentLength, |
| // must limit number of bytes read |
| i = in.read(bytes, 0, (contentLength - length)); |
| } else { |
| i = in.read(bytes); |
| } |
| } |
| content = out.toByteArray(); |
| } |
| |
| /** |
| * @param in |
| * @param line |
| * @throws HttpException |
| * @throws IOException |
| */ |
| private void readChunkedContent(PushbackInputStream in, StringBuffer line) |
| throws HttpException, IOException { |
| boolean doneChunks = false; |
| int contentBytesRead = 0; |
| byte[] bytes = new byte[Http.BUFFER_SIZE]; |
| ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE); |
| |
| while (true) { |
| if (Http.LOG.isTraceEnabled()) { |
| Http.LOG.trace("Http: starting chunk"); |
| } |
| |
| readLine(in, line, false); |
| |
| String chunkLenStr; |
| // if (LOG.isTraceEnabled()) { LOG.trace("chunk-header: '" + line + "'"); |
| // } |
| |
| int pos = line.indexOf(";"); |
| if (pos < 0) { |
| chunkLenStr = line.toString(); |
| } else { |
| chunkLenStr = line.substring(0, pos); |
| // if (LOG.isTraceEnabled()) { LOG.trace("got chunk-ext: " + |
| // line.substring(pos+1)); } |
| } |
| chunkLenStr = chunkLenStr.trim(); |
| int chunkLen; |
| try { |
| chunkLen = Integer.parseInt(chunkLenStr, 16); |
| } catch (NumberFormatException e) { |
| throw new HttpException("bad chunk length: " + line.toString()); |
| } |
| |
| if (chunkLen == 0) { |
| doneChunks = true; |
| break; |
| } |
| |
| if (http.getMaxContent() >= 0 |
| && (contentBytesRead + chunkLen) > http.getMaxContent()) { |
| // content will be trimmed when processing this chunk |
| chunkLen = http.getMaxContent() - contentBytesRead; |
| } |
| |
| // read one chunk |
| int chunkBytesRead = 0; |
| while (chunkBytesRead < chunkLen) { |
| |
| int toRead = (chunkLen - chunkBytesRead) < Http.BUFFER_SIZE ? |
| (chunkLen - chunkBytesRead) : |
| Http.BUFFER_SIZE; |
| int len = in.read(bytes, 0, toRead); |
| |
| if (len == -1) |
| throw new HttpException("chunk eof after " + contentBytesRead |
| + " bytes in successful chunks" + " and " + chunkBytesRead |
| + " in current chunk"); |
| |
| // DANGER!!! Will printed GZIPed stuff right to your |
| // terminal! |
| // if (LOG.isTraceEnabled()) { LOG.trace("read: " + new String(bytes, 0, |
| // len)); } |
| |
| out.write(bytes, 0, len); |
| chunkBytesRead += len; |
| } |
| |
| contentBytesRead += chunkBytesRead; |
| if (http.getMaxContent() >= 0 |
| && contentBytesRead >= http.getMaxContent()) { |
| Http.LOG.trace("Http: content limit reached"); |
| break; |
| } |
| |
| readLine(in, line, false); |
| |
| } |
| |
| content = out.toByteArray(); |
| |
| if (!doneChunks) { |
| // content trimmed |
| if (contentBytesRead != http.getMaxContent()) |
| throw new HttpException("chunk eof: !doneChunk && didn't max out"); |
| return; |
| } |
| |
| // read trailing headers |
| parseHeaders(in, line, null); |
| |
| } |
| |
| private int parseStatusLine(PushbackInputStream in, StringBuffer line, |
| StringBuffer lineSeparator) throws IOException, HttpException { |
| readLine(in, line, false, 2048, lineSeparator); |
| |
| int codeStart = line.indexOf(" "); |
| int codeEnd; |
| int lineLength = line.length(); |
| |
| // We want to handle lines like "HTTP/1.1 200", "HTTP/1.1 200 OK", or "HTTP/1.1 404: Not Found" |
| for (codeEnd = codeStart + 1; codeEnd < lineLength; codeEnd++) { |
| if (!Character.isDigit(line.charAt(codeEnd))) break; |
| // Note: input is plain ASCII and may not contain Arabic etc. digits |
| // covered by Character.isDigit() |
| } |
| |
| try { |
| return Integer.parseInt(line.substring(codeStart + 1, codeEnd)); |
| } catch (NumberFormatException e) { |
| throw new HttpException("Bad status line, no HTTP response code: " + line, e); |
| } |
| } |
| |
| private void processHeaderLine(StringBuffer line) { |
| |
| int colonIndex = line.indexOf(":"); // key is up to colon |
| if (colonIndex == -1) { |
| Http.LOG.info("Ignoring a header line without a colon: '{}'", line); |
| return; |
| } |
| String key = line.substring(0, colonIndex); |
| |
| int valueStart = colonIndex + 1; // skip whitespace |
| while (valueStart < line.length()) { |
| int c = line.charAt(valueStart); |
| if (c != ' ' && c != '\t') |
| break; |
| valueStart++; |
| } |
| String value = line.substring(valueStart); |
| headers.set(key, value); |
| } |
| |
| // Adds headers to our headers Metadata |
| private void parseHeaders(PushbackInputStream in, StringBuffer line, |
| StringBuffer httpHeaders) throws IOException, HttpException { |
| |
| while (readLine(in, line, true) != 0) { |
| |
| if (httpHeaders != null) |
| httpHeaders.append(line).append("\r\n"); |
| |
| // handle HTTP responses with missing blank line after headers |
| int pos; |
| if (((pos = line.indexOf("<!DOCTYPE")) != -1) || ( |
| (pos = line.indexOf("<HTML")) != -1) || ((pos = line.indexOf("<html")) |
| != -1)) { |
| |
| in.unread(line.substring(pos).getBytes(StandardCharsets.ISO_8859_1)); |
| line.setLength(pos); |
| |
| try { |
| // TODO: (CM) We don't know the header names here |
| // since we're just handling them generically. It would |
| // be nice to provide some sort of mapping function here |
| // for the returned header names to the standard metadata |
| // names in the ParseData class |
| processHeaderLine(line); |
| } catch (Exception e) { |
| // fixme: |
| Http.LOG.warn("Error: ", e); |
| } |
| return; |
| } |
| |
| processHeaderLine(line); |
| } |
| } |
| |
| private static int readLine(PushbackInputStream in, StringBuffer line, |
| boolean allowContinuedLine) throws IOException { |
| return readLine(in, line, allowContinuedLine, Http.BUFFER_SIZE, null); |
| } |
| |
| private static int readLine(PushbackInputStream in, StringBuffer line, |
| boolean allowContinuedLine, int maxBytes, StringBuffer lineSeparator) throws IOException { |
| line.setLength(0); |
| int bytesRead = 0; |
| for (int c = in.read(); c != -1 |
| && bytesRead < maxBytes; c = in.read(), bytesRead++) { |
| switch (c) { |
| case '\r': |
| if (lineSeparator != null) { |
| lineSeparator.append((char) c); |
| } |
| if (peek(in) == '\n') { |
| in.read(); |
| if (lineSeparator != null) { |
| lineSeparator.append((char) c); |
| } |
| } |
| // fall-through |
| case '\n': |
| if (lineSeparator != null) { |
| lineSeparator.append((char) c); |
| } |
| if (line.length() > 0) { |
| // at EOL -- check for continued line if the current |
| // (possibly continued) line wasn't blank |
| if (allowContinuedLine) |
| switch (peek(in)) { |
| case ' ': |
| case '\t': // line is continued |
| in.read(); |
| if (lineSeparator != null) { |
| lineSeparator.replace(0, lineSeparator.length(), ""); |
| } |
| continue; |
| } |
| } |
| return line.length(); // else complete |
| default: |
| line.append((char) c); |
| } |
| } |
| if (bytesRead >= maxBytes) { |
| throw new IOException("Line exceeds max. buffer size: " |
| + line.substring(0, Math.min(32, line.length()))); |
| } |
| return line.length(); |
| } |
| |
| private static int peek(PushbackInputStream in) throws IOException { |
| int value = in.read(); |
| in.unread(value); |
| return value; |
| } |
| } |