blob: e88c7acae182b62fcea8c81ea2b9d3299ce64c51 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.parse;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Arrays;
import org.apache.commons.cli.Options;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ArrayFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.VersionMismatchException;
import org.apache.hadoop.io.VersionedWritable;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.util.NutchConfiguration;
/**
* Data extracted from a page's content.
*
* @see Parse#getData()
*/
public final class ParseData extends VersionedWritable {
public static final String DIR_NAME = "parse_data";
private static final byte VERSION = 5;
private String title;
private Outlink[] outlinks;
private Metadata contentMeta;
private Metadata parseMeta;
private ParseStatus status;
private byte version = VERSION;
public ParseData() {
contentMeta = new Metadata();
parseMeta = new Metadata();
}
public ParseData(ParseStatus status, String title, Outlink[] outlinks,
Metadata contentMeta) {
this(status, title, outlinks, contentMeta, new Metadata());
}
public ParseData(ParseStatus status, String title, Outlink[] outlinks,
Metadata contentMeta, Metadata parseMeta) {
this.status = status;
this.title = title;
this.outlinks = outlinks;
this.contentMeta = contentMeta;
this.parseMeta = parseMeta;
}
//
// Accessor methods
//
/** The status of parsing the page. */
public ParseStatus getStatus() {
return status;
}
/** The title of the page. */
public String getTitle() {
return title;
}
/** The outlinks of the page. */
public Outlink[] getOutlinks() {
return outlinks;
}
/** The original Metadata retrieved from content */
public Metadata getContentMeta() {
return contentMeta;
}
/**
* Other content properties. This is the place to find format-specific
* properties. Different parser implementations for different content types
* will populate this differently.
*/
public Metadata getParseMeta() {
return parseMeta;
}
public void setParseMeta(Metadata parseMeta) {
this.parseMeta = parseMeta;
}
public void setOutlinks(Outlink[] outlinks) {
this.outlinks = outlinks;
}
/**
* Get a metadata single value. This method first looks for the metadata value
* in the parse metadata. If no value is found it the looks for the metadata
* in the content metadata.
*
* @see #getContentMeta()
* @see #getParseMeta()
*/
public String getMeta(String name) {
String value = parseMeta.get(name);
if (value == null) {
value = contentMeta.get(name);
}
return value;
}
//
// Writable methods
//
public byte getVersion() {
return version;
}
public final void readFields(DataInput in) throws IOException {
version = in.readByte();
// incompatible change from UTF8 (version < 5) to Text
if (version != VERSION)
throw new VersionMismatchException(VERSION, version);
status = ParseStatus.read(in);
title = Text.readString(in); // read title
int numOutlinks = in.readInt();
outlinks = new Outlink[numOutlinks];
for (int i = 0; i < numOutlinks; i++) {
outlinks[i] = Outlink.read(in);
}
contentMeta.clear();
contentMeta.readFields(in);
parseMeta.clear();
parseMeta.readFields(in);
}
public final void write(DataOutput out) throws IOException {
out.writeByte(VERSION); // write version
status.write(out); // write status
Text.writeString(out, title); // write title
out.writeInt(outlinks.length); // write outlinks
for (int i = 0; i < outlinks.length; i++) {
outlinks[i].write(out);
}
contentMeta.write(out); // write content metadata
parseMeta.write(out);
}
public static ParseData read(DataInput in) throws IOException {
ParseData parseText = new ParseData();
parseText.readFields(in);
return parseText;
}
//
// other methods
//
public boolean equals(Object o) {
if (!(o instanceof ParseData))
return false;
ParseData other = (ParseData) o;
return this.status.equals(other.status) && this.title.equals(other.title)
&& Arrays.equals(this.outlinks, other.outlinks)
&& this.contentMeta.equals(other.contentMeta)
&& this.parseMeta.equals(other.parseMeta);
}
public String toString() {
StringBuffer buffer = new StringBuffer();
buffer.append("Version: " + version + "\n");
buffer.append("Status: " + status + "\n");
buffer.append("Title: " + title + "\n");
if (outlinks != null) {
buffer.append("Outlinks: " + outlinks.length + "\n");
for (int i = 0; i < outlinks.length; i++) {
buffer.append(" outlink: " + outlinks[i] + "\n");
}
}
buffer.append("Content Metadata: " + contentMeta + "\n");
buffer.append("Parse Metadata: " + parseMeta + "\n");
return buffer.toString();
}
public static void main(String argv[]) throws Exception {
String usage = "ParseData (-local | -dfs <namenode:port>) recno segment";
if (argv.length < 3) {
System.out.println("usage:" + usage);
return;
}
Options opts = new Options();
Configuration conf = NutchConfiguration.create();
GenericOptionsParser parser = new GenericOptionsParser(conf, opts, argv);
String[] remainingArgs = parser.getRemainingArgs();
try (FileSystem fs = FileSystem.get(conf)) {
int recno = Integer.parseInt(remainingArgs[0]);
String segment = remainingArgs[1];
Path file = new Path(segment, DIR_NAME);
System.out.println("Reading from file: " + file);
ArrayFile.Reader parses = new ArrayFile.Reader(fs, file.toString(), conf);
ParseData parseDatum = new ParseData();
parses.get(recno, parseDatum);
System.out.println("Retrieved " + recno + " from file " + file);
System.out.println(parseDatum);
parses.close();
}
}
}