| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.nutch.tools; |
| |
| import java.io.ByteArrayInputStream; |
| import java.io.ByteArrayOutputStream; |
| import java.io.IOException; |
| import java.io.UnsupportedEncodingException; |
| import java.net.InetAddress; |
| import java.net.UnknownHostException; |
| import java.util.Date; |
| import java.util.List; |
| |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.nutch.indexer.NutchDocument; |
| import org.apache.nutch.util.StringUtil; |
| import org.archive.format.http.HttpHeaders; |
| import org.archive.format.warc.WARCConstants; |
| import org.archive.io.warc.WARCRecordInfo; |
| import org.archive.uid.UUIDGenerator; |
| import org.archive.util.DateUtils; |
| import org.archive.util.anvl.ANVLRecord; |
| |
| public class WARCUtils { |
| public final static String SOFTWARE = "software"; |
| public final static String HTTP_HEADER_FROM = "http-header-from"; |
| public final static String HTTP_HEADER_USER_AGENT = "http-header-user-agent"; |
| public final static String HOSTNAME = "hostname"; |
| public final static String ROBOTS = "robots"; |
| public final static String OPERATOR = "operator"; |
| public final static String FORMAT = "format"; |
| public final static String CONFORMS_TO = "conformsTo"; |
| public final static String IP = "ip"; |
| public final static UUIDGenerator generator = new UUIDGenerator(); |
| |
| public static final ANVLRecord getWARCInfoContent(Configuration conf) { |
| ANVLRecord record = new ANVLRecord(); |
| |
| // informative headers |
| record.addLabelValue(FORMAT, "WARC File Format 1.0"); |
| record.addLabelValue(CONFORMS_TO, "http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf"); |
| |
| record.addLabelValue(SOFTWARE, conf.get("http.agent.name", "")); |
| record.addLabelValue(HTTP_HEADER_USER_AGENT, |
| getAgentString(conf.get("http.agent.name", ""), |
| conf.get("http.agent.version", ""), |
| conf.get("http.agent.description", ""), |
| conf.get("http.agent.url", ""), |
| conf.get("http.agent.email", ""))); |
| record.addLabelValue(HTTP_HEADER_FROM, |
| conf.get("http.agent.email", "")); |
| |
| try { |
| record.addLabelValue(HOSTNAME, getHostname(conf)); |
| record.addLabelValue(IP, getIPAddress(conf)); |
| } catch (UnknownHostException ignored) { |
| // do nothing as this fields are optional |
| } |
| |
| record.addLabelValue(ROBOTS, "classic"); // TODO Make configurable? |
| record.addLabelValue(OPERATOR, conf.get("http.agent.email", "")); |
| |
| return record; |
| } |
| |
| public static final String getHostname(Configuration conf) |
| throws UnknownHostException { |
| |
| return StringUtil.isEmpty(conf.get("http.agent.host", "")) ? |
| InetAddress.getLocalHost().getHostName() : |
| conf.get("http.agent.host"); |
| } |
| |
| public static final String getIPAddress(Configuration conf) |
| throws UnknownHostException { |
| |
| return InetAddress.getLocalHost().getHostAddress(); |
| } |
| |
| public static final byte[] toByteArray(HttpHeaders headers) |
| throws IOException { |
| ByteArrayOutputStream out = new ByteArrayOutputStream(); |
| headers.write(out); |
| |
| return out.toByteArray(); |
| } |
| |
| public static final String getAgentString(String name, String version, |
| String description, String URL, String email) { |
| |
| StringBuffer buf = new StringBuffer(); |
| |
| buf.append(name); |
| |
| if (version != null) { |
| buf.append("/").append(version); |
| } |
| |
| if (((description != null) && (description.length() != 0)) || ( |
| (email != null) && (email.length() != 0)) || ((URL != null) && ( |
| URL.length() != 0))) { |
| buf.append(" ("); |
| |
| if ((description != null) && (description.length() != 0)) { |
| buf.append(description); |
| if ((URL != null) || (email != null)) |
| buf.append("; "); |
| } |
| |
| if ((URL != null) && (URL.length() != 0)) { |
| buf.append(URL); |
| if (email != null) |
| buf.append("; "); |
| } |
| |
| if ((email != null) && (email.length() != 0)) |
| buf.append(email); |
| |
| buf.append(")"); |
| } |
| |
| return buf.toString(); |
| } |
| |
| public static final WARCRecordInfo docToMetadata(NutchDocument doc) |
| throws UnsupportedEncodingException { |
| WARCRecordInfo record = new WARCRecordInfo(); |
| |
| record.setType(WARCConstants.WARCRecordType.metadata); |
| record.setUrl((String) doc.getFieldValue("id")); |
| record.setCreate14DigitDate( |
| DateUtils.get14DigitDate((Date) doc.getFieldValue("tstamp"))); |
| record.setMimetype("application/warc-fields"); |
| record.setRecordId(generator.getRecordID()); |
| |
| // metadata |
| ANVLRecord metadata = new ANVLRecord(); |
| |
| for (String field : doc.getFieldNames()) { |
| List<Object> values = doc.getField(field).getValues(); |
| for (Object value : values) { |
| if (value instanceof Date) { |
| metadata.addLabelValue(field, DateUtils.get14DigitDate()); |
| } else { |
| metadata.addLabelValue(field, (String) value); |
| } |
| } |
| } |
| |
| record.setContentLength(metadata.getLength()); |
| record.setContentStream( |
| new ByteArrayInputStream(metadata.getUTF8Bytes())); |
| |
| return record; |
| } |
| } |