blob: a8807832acfe283f207c939be73b967623de5f96 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.tools;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.Date;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.util.StringUtil;
import org.archive.format.http.HttpHeaders;
import org.archive.format.warc.WARCConstants;
import org.archive.io.warc.WARCRecordInfo;
import org.archive.uid.UUIDGenerator;
import org.archive.util.DateUtils;
import org.archive.util.anvl.ANVLRecord;
public class WARCUtils {
public final static String SOFTWARE = "software";
public final static String HTTP_HEADER_FROM = "http-header-from";
public final static String HTTP_HEADER_USER_AGENT = "http-header-user-agent";
public final static String HOSTNAME = "hostname";
public final static String ROBOTS = "robots";
public final static String OPERATOR = "operator";
public final static String FORMAT = "format";
public final static String CONFORMS_TO = "conformsTo";
public final static String IP = "ip";
public final static UUIDGenerator generator = new UUIDGenerator();
public static final ANVLRecord getWARCInfoContent(Configuration conf) {
ANVLRecord record = new ANVLRecord();
// informative headers
record.addLabelValue(FORMAT, "WARC File Format 1.0");
record.addLabelValue(CONFORMS_TO, "http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf");
record.addLabelValue(SOFTWARE, conf.get("http.agent.name", ""));
record.addLabelValue(HTTP_HEADER_USER_AGENT,
getAgentString(conf.get("http.agent.name", ""),
conf.get("http.agent.version", ""),
conf.get("http.agent.description", ""),
conf.get("http.agent.url", ""),
conf.get("http.agent.email", "")));
record.addLabelValue(HTTP_HEADER_FROM,
conf.get("http.agent.email", ""));
try {
record.addLabelValue(HOSTNAME, getHostname(conf));
record.addLabelValue(IP, getIPAddress(conf));
} catch (UnknownHostException ignored) {
// do nothing as this fields are optional
}
record.addLabelValue(ROBOTS, "classic"); // TODO Make configurable?
record.addLabelValue(OPERATOR, conf.get("http.agent.email", ""));
return record;
}
public static final String getHostname(Configuration conf)
throws UnknownHostException {
return StringUtil.isEmpty(conf.get("http.agent.host", "")) ?
InetAddress.getLocalHost().getHostName() :
conf.get("http.agent.host");
}
public static final String getIPAddress(Configuration conf)
throws UnknownHostException {
return InetAddress.getLocalHost().getHostAddress();
}
public static final byte[] toByteArray(HttpHeaders headers)
throws IOException {
ByteArrayOutputStream out = new ByteArrayOutputStream();
headers.write(out);
return out.toByteArray();
}
public static final String getAgentString(String name, String version,
String description, String URL, String email) {
StringBuffer buf = new StringBuffer();
buf.append(name);
if (version != null) {
buf.append("/").append(version);
}
if (((description != null) && (description.length() != 0)) || (
(email != null) && (email.length() != 0)) || ((URL != null) && (
URL.length() != 0))) {
buf.append(" (");
if ((description != null) && (description.length() != 0)) {
buf.append(description);
if ((URL != null) || (email != null))
buf.append("; ");
}
if ((URL != null) && (URL.length() != 0)) {
buf.append(URL);
if (email != null)
buf.append("; ");
}
if ((email != null) && (email.length() != 0))
buf.append(email);
buf.append(")");
}
return buf.toString();
}
public static final WARCRecordInfo docToMetadata(NutchDocument doc)
throws UnsupportedEncodingException {
WARCRecordInfo record = new WARCRecordInfo();
record.setType(WARCConstants.WARCRecordType.metadata);
record.setUrl((String) doc.getFieldValue("id"));
record.setCreate14DigitDate(
DateUtils.get14DigitDate((Date) doc.getFieldValue("tstamp")));
record.setMimetype("application/warc-fields");
record.setRecordId(generator.getRecordID());
// metadata
ANVLRecord metadata = new ANVLRecord();
for (String field : doc.getFieldNames()) {
List<Object> values = doc.getField(field).getValues();
for (Object value : values) {
if (value instanceof Date) {
metadata.addLabelValue(field, DateUtils.get14DigitDate());
} else {
metadata.addLabelValue(field, (String) value);
}
}
}
record.setContentLength(metadata.getLength());
record.setContentStream(
new ByteArrayInputStream(metadata.getUTF8Bytes()));
return record;
}
}