| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| /* |
| * The MIT License (MIT) |
| * Copyright (c) 2014 Martin Kleppmann |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a copy |
| * of this software and associated documentation files (the "Software"), to deal |
| * in the Software without restriction, including without limitation the rights |
| * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| * copies of the Software, and to permit persons to whom the Software is |
| * furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice shall be included in |
| * all copies or substantial portions of the Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
| * THE SOFTWARE. |
| */ |
| |
| package org.apache.hadoop.hbase.test.util.warc; |
| |
| import java.io.ByteArrayOutputStream; |
| import java.io.DataInput; |
| import java.io.DataOutput; |
| import java.io.IOException; |
| import java.util.LinkedHashMap; |
| import java.util.Map; |
| import java.util.regex.Pattern; |
| |
/**
 * Implementation of a record in a WARC file. You create a {@link WARCRecord}
 * by parsing it out of a {@link DataInput} stream. The record exposes no mutators, but
 * note that {@link #getContent()} hands out the internal byte array rather than a copy.
 *
 * The file format is documented in the
 * [ISO Standard](http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf).
 * In a nutshell, it's a textual format consisting of lines delimited by `\r\n`.
 * Each record has the following structure:
 *
 * 1. A line indicating the WARC version number, such as `WARC/1.0`.
 * 2. Several header lines (in key-value format, similar to HTTP or email headers),
 *    giving information about the record. The header is terminated by an empty line.
 * 3. A body consisting of raw bytes (the number of bytes is indicated in one of the headers).
 * 4. A final separator of `\r\n\r\n` before the next record starts.
 *
 * There are various different types of records, as documented on
 * {@link Header#getRecordType()}.
 */
public class WARCRecord {

  public static final String WARC_VERSION = "WARC/1.0";
  private static final Pattern VERSION_PATTERN = Pattern.compile("WARC/[0-9\\.]+");
  // DOTALL so that a folded line is still recognised as a continuation when it carries
  // a stray CR or LF (readLine only strips the terminating CR LF pair).
  private static final Pattern CONTINUATION_PATTERN =
    Pattern.compile("^[\\t ]+.*", Pattern.DOTALL);
  private static final String CRLF = "\r\n";
  private static final byte CR = 13;
  private static final byte LF = 10;
  private static final byte[] CRLF_BYTES = { CR, LF };

  private final Header header;
  private final byte[] content;

  /**
   * Creates a new WARCRecord by parsing it out of a {@link DataInput} stream. Exactly one
   * record (version line, headers, body and trailing separator) is consumed from the input.
   * @param in The input source from which one record will be read.
   * @throws IOException if the underlying input fails or ends before the record is complete.
   * @throws IllegalStateException if the data read is not a well-formed WARC record
   *     (bad version line, malformed header, missing/invalid/negative Content-Length,
   *     or wrong trailing separator).
   */
  public WARCRecord(DataInput in) throws IOException {
    header = readHeader(in);
    int length = header.getContentLength();
    if (length < 0) {
      // Report bad input the same way as every other parse failure instead of letting
      // the array allocation below blow up with NegativeArraySizeException.
      throw new IllegalStateException("Negative Content-Length header: " + length);
    }
    content = new byte[length];
    in.readFully(content);
    readSeparator(in);
  }

  /**
   * Reads the version line and all header lines up to (and including) the empty line
   * that terminates the header section.
   */
  private static Header readHeader(DataInput in) throws IOException {
    String versionLine = readLine(in);
    if (!VERSION_PATTERN.matcher(versionLine).matches()) {
      throw new IllegalStateException("Expected WARC version, but got: " + versionLine);
    }

    // Insertion order is kept so that the header is written back in the order it was read.
    Map<String, String> headers = new LinkedHashMap<>();
    String fieldName = null;

    for (String line = readLine(in); !line.isEmpty(); line = readLine(in)) {
      if (fieldName != null && CONTINUATION_PATTERN.matcher(line).matches()) {
        // Folded header: a line starting with whitespace extends the previous field's value.
        headers.put(fieldName, headers.get(fieldName) + line);
      } else {
        String[] field = line.split(":", 2);
        if (field.length < 2) {
          throw new IllegalStateException("Malformed header line: " + line);
        }
        fieldName = field[0].trim();
        headers.put(fieldName, field[1].trim());
      }
    }

    return new Header(headers);
  }

  /**
   * Reads bytes up to the next CR LF pair and returns them (without the terminator)
   * decoded as UTF-8. A CR that is not immediately followed by LF is ordinary data and is
   * kept in the returned line; in particular `CR CR LF` yields a line ending in one CR.
   */
  private static String readLine(DataInput in) throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    // True while the most recently read byte was a CR that has not been emitted yet,
    // because it may turn out to be the first half of the line terminator.
    boolean pendingCR = false;
    while (true) {
      byte b = in.readByte();
      if (pendingCR && b == LF) {
        return out.toString("UTF-8");
      }
      if (pendingCR) {
        // The held-back CR was not part of a terminator after all; it is data.
        out.write(CR);
      }
      pendingCR = (b == CR);
      if (!pendingCR) {
        out.write(b);
      }
    }
  }

  /**
   * Consumes the four-byte CR LF CR LF separator that follows the record body.
   */
  private static void readSeparator(DataInput in) throws IOException {
    byte[] sep = new byte[4];
    in.readFully(sep);
    if (sep[0] != CR || sep[1] != LF || sep[2] != CR || sep[3] != LF) {
      throw new IllegalStateException(String.format(
        "Expected final separator CR LF CR LF, but got: %d %d %d %d",
        sep[0], sep[1], sep[2], sep[3]));
    }
  }

  /**
   * Returns the parsed header structure of the WARC record.
   */
  public Header getHeader() {
    return header;
  }

  /**
   * Returns the body of the record, as an unparsed raw array of bytes. The content
   * of the body depends on the type of record (see {@link Header#getRecordType()}).
   * For example, in the case of a `response` type header, the body consists of the
   * full HTTP response returned by the server (HTTP headers followed by the body).
   *
   * The returned array is the record's own buffer, not a copy, so modifying it
   * changes what this record subsequently returns and writes.
   */
  public byte[] getContent() {
    return content;
  }

  /**
   * Writes this record to a {@link DataOutput} stream. The output may, in some edge
   * cases, be not byte-for-byte identical to what was parsed from a {@link DataInput}.
   * However it has the same meaning and should not lose any information.
   * @param out The output stream to which this record should be appended.
   * @throws IOException if writing to the output fails.
   */
  public void write(DataOutput out) throws IOException {
    header.write(out);
    out.write(CRLF_BYTES);   // empty line terminating the header section
    out.write(content);
    out.write(CRLF_BYTES);   // record separator is CR LF CR LF
    out.write(CRLF_BYTES);
  }

  /**
   * Returns a human-readable string representation of the record.
   */
  @Override
  public String toString() {
    return header.toString();
  }

  /**
   * Contains the parsed headers of a {@link WARCRecord}. Each record contains a number
   * of headers in key-value format, where some header keys are standardised, but
   * nonstandard ones can be added.
   *
   * The documentation of the methods in this class is excerpted from the
   * [WARC 1.0 specification](http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf).
   * Please see the specification for more detail.
   */
  public static final class Header {
    private final Map<String, String> fields;

    private Header(Map<String, String> fields) {
      this.fields = fields;
    }

    /**
     * Returns the type of WARC record (the value of the `WARC-Type` header field).
     * WARC 1.0 defines the following record types: (for full definitions, see the
     * [spec](http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf))
     *
     * * `warcinfo`: Describes the records that follow it, up through end of file,
     *   end of input, or until next `warcinfo` record. Typically, this appears once and
     *   at the beginning of a WARC file. For a web archive, it often contains information
     *   about the web crawl which generated the following records.
     *
     *   The format of this descriptive record block may vary, though the use of the
     *   `"application/warc-fields"` content-type is recommended. (...)
     *
     * * `response`: The record should contain a complete scheme-specific response, including
     *   network protocol information where possible. For a target-URI of the `http` or
     *   `https` schemes, a `response` record block should contain the full HTTP
     *   response received over the network, including headers. That is, it contains the
     *   'Response' message defined by section 6 of HTTP/1.1 (RFC2616).
     *
     *   The WARC record's Content-Type field should contain the value defined by HTTP/1.1,
     *   `"application/http;msgtype=response"`. The payload of the record is defined as its
     *   'entity-body' (per RFC2616), with any transfer-encoding removed.
     *
     * * `resource`: The record contains a resource, without full protocol response
     *   information. For example: a file directly retrieved from a locally accessible
     *   repository or the result of a networked retrieval where the protocol information
     *   has been discarded. For a target-URI of the `http` or `https` schemes, a `resource`
     *   record block shall contain the returned 'entity-body' (per RFC2616, with any
     *   transfer-encodings removed), possibly truncated.
     *
     * * `request`: The record holds the details of a complete scheme-specific request,
     *   including network protocol information where possible. For a target-URI of the
     *   `http` or `https` schemes, a `request` record block should contain the full HTTP
     *   request sent over the network, including headers. That is, it contains the
     *   'Request' message defined by section 5 of HTTP/1.1 (RFC2616).
     *
     *   The WARC record's Content-Type field should contain the value defined by HTTP/1.1,
     *   `"application/http;msgtype=request"`. The payload of a `request` record with a
     *   target-URI of scheme `http` or `https` is defined as its 'entity-body' (per
     *   RFC2616), with any transfer-encoding removed.
     *
     * * `metadata`: The record contains content created in order to further describe,
     *   explain, or accompany a harvested resource, in ways not covered by other record
     *   types. A `metadata` record will almost always refer to another record of another
     *   type, with that other record holding original harvested or transformed content.
     *
     *   The format of the metadata record block may vary. The `"application/warc-fields"`
     *   format may be used.
     *
     * * `revisit`: The record describes the revisitation of content already archived,
     *   and might include only an abbreviated content body which has to be interpreted
     *   relative to a previous record. Most typically, a `revisit` record is used
     *   instead of a `response` or `resource` record to indicate that the content
     *   visited was either a complete or substantial duplicate of material previously
     *   archived.
     *
     *   A `revisit` record shall contain a WARC-Profile field which determines the
     *   interpretation of the record's fields and record block. Please see the
     *   specification for details.
     *
     * * `conversion`: The record shall contain an alternative version of another
     *   record's content that was created as the result of an archival process.
     *   Typically, this is used to hold content transformations that maintain viability
     *   of content after widely available rendering tools for the originally stored
     *   format disappear. As needed, the original content may be migrated (transformed)
     *   to a more viable format in order to keep the information usable with current
     *   tools while minimizing loss of information.
     *
     * * `continuation`: Record blocks from `continuation` records must be appended to
     *   corresponding prior record blocks (eg. from other WARC files) to create the
     *   logically complete full-sized original record. That is, `continuation`
     *   records are used when a record that would otherwise cause a WARC file size to
     *   exceed a desired limit is broken into segments. A continuation record shall
     *   contain the named fields `WARC-Segment-Origin-ID` and `WARC-Segment-Number`,
     *   and the last `continuation` record of a series shall contain a
     *   `WARC-Segment-Total-Length` field. Please see the specification for details.
     *
     * * Other record types may be added in future, so this list is not exclusive.
     *
     * @return The record's `WARC-Type` header field, as a string, or null if not present.
     */
    public String getRecordType() {
      return fields.get("WARC-Type");
    }

    /**
     * A 14-digit UTC timestamp formatted according to YYYY-MM-DDThh:mm:ssZ, described
     * in the W3C profile of ISO8601. The timestamp shall represent the instant that
     * data capture for record creation began. Multiple records written as part of a
     * single capture event shall use the same WARC-Date, even though the times of
     * their writing will not be exactly synchronized.
     *
     * @return The record's `WARC-Date` header field, as a string, or null if not present.
     */
    public String getDateString() {
      return fields.get("WARC-Date");
    }

    /**
     * An identifier assigned to the current record that is globally unique for its
     * period of intended use. No identifier scheme is mandated by this specification,
     * but each record-id shall be a legal URI and clearly indicate a documented and
     * registered scheme to which it conforms (e.g., via a URI scheme prefix such as
     * `http:` or `urn:`).
     *
     * @return The record's `WARC-Record-ID` header field, as a string, or null if not present.
     */
    public String getRecordID() {
      return fields.get("WARC-Record-ID");
    }

    /**
     * The MIME type (RFC2045) of the information contained in the record's block. For
     * example, in HTTP request and response records, this would be `application/http`
     * as per section 19.1 of RFC2616 (or `application/http; msgtype=request` and
     * `application/http; msgtype=response` respectively).
     *
     * In particular, the content-type is *not* the value of the HTTP Content-Type
     * header in an HTTP response, but a MIME type to describe the full archived HTTP
     * message (hence `application/http` if the block contains request or response
     * headers).
     *
     * @return The record's `Content-Type` header field, as a string, or null if not present.
     */
    public String getContentType() {
      return fields.get("Content-Type");
    }

    /**
     * The original URI whose capture gave rise to the information content in this record.
     * In the context of web harvesting, this is the URI that was the target of a
     * crawler's retrieval request. For a `revisit` record, it is the URI that was the
     * target of a retrieval request. Indirectly, such as for a `metadata`, or `conversion`
     * record, it is a copy of the `WARC-Target-URI` appearing in the original record to
     * which the newer record pertains. The URI in this value shall be properly escaped
     * according to RFC3986, and written with no internal whitespace.
     *
     * @return The record's `WARC-Target-URI` header field, as a string, or null if not present.
     */
    public String getTargetURI() {
      return fields.get("WARC-Target-URI");
    }

    /**
     * The number of bytes in the body of the record, similar to RFC2616.
     *
     * @return The record's `Content-Length` header field, parsed into an int.
     * @throws IllegalStateException if the header is missing or is not a valid int.
     */
    public int getContentLength() {
      String lengthStr = fields.get("Content-Length");
      if (lengthStr == null) {
        throw new IllegalStateException("Missing Content-Length header");
      }
      try {
        return Integer.parseInt(lengthStr);
      } catch (NumberFormatException e) {
        // Keep the parse failure as the cause so the original diagnostics are not lost.
        throw new IllegalStateException("Malformed Content-Length header: " + lengthStr, e);
      }
    }

    /**
     * Returns the value of a selected header field, or null if there is no header with
     * that field name.
     * @param field The name of the header to return (case-sensitive).
     * @return The value associated with that field name, or null if not present.
     */
    public String getField(String field) {
      return fields.get(field);
    }

    /**
     * Appends this header to a {@link DataOutput} stream, in WARC/1.0 format. The bytes
     * written are the UTF-8 encoding of {@link #toString()}; the empty line that ends
     * the header section is not included.
     * @param out The data output to which the header should be written.
     * @throws IOException if writing to the output fails.
     */
    public void write(DataOutput out) throws IOException {
      out.write(toString().getBytes("UTF-8"));
    }

    /**
     * Formats this header in WARC/1.0 format, consisting of a version line followed
     * by colon-delimited key-value pairs, and `\r\n` line endings. Fields appear in the
     * iteration order of the underlying map.
     */
    @Override
    public String toString() {
      StringBuilder buf = new StringBuilder();
      buf.append(WARC_VERSION).append(CRLF);
      for (Map.Entry<String, String> field : fields.entrySet()) {
        buf.append(field.getKey()).append(": ").append(field.getValue()).append(CRLF);
      }
      return buf.toString();
    }
  }

}