blob: b93bc476163e7dd6dc0435a0bb5ce2a5e9163da3 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.parse;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ArrayFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.VersionMismatchException;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.util.GenericOptionsParser;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.commons.cli.Options;
import org.apache.nutch.util.NutchConfiguration;
/* The text conversion of page's content, stored using gzip compression.
* @see Parse#getText()
*/
public final class ParseText implements Writable {
public static final String DIR_NAME = "parse_text";
private static final byte VERSION = 2;
public ParseText() {
//default constructor
}
private String text;
public ParseText(String text) {
this.text = text;
}
public void readFields(DataInput in) throws IOException {
byte version = in.readByte();
switch (version) {
case 1:
text = WritableUtils.readCompressedString(in);
break;
case VERSION:
text = Text.readString(in);
break;
default:
throw new VersionMismatchException(VERSION, version);
}
}
public final void write(DataOutput out) throws IOException {
out.write(VERSION);
Text.writeString(out, text);
}
public static final ParseText read(DataInput in) throws IOException {
ParseText parseText = new ParseText();
parseText.readFields(in);
return parseText;
}
//
// Accessor methods
//
public String getText() {
return text;
}
@Override
public boolean equals(Object o) {
if (!(o instanceof ParseText))
return false;
ParseText other = (ParseText) o;
return this.text.equals(other.text);
}
@Override
public String toString() {
return text;
}
public static void main(String argv[]) throws Exception {
String usage = "ParseText (-local | -dfs <namenode:port>) recno segment";
if (argv.length < 3) {
System.out.println("usage:" + usage);
return;
}
Options opts = new Options();
Configuration conf = NutchConfiguration.create();
GenericOptionsParser parser = new GenericOptionsParser(conf, opts, argv);
String[] remainingArgs = parser.getRemainingArgs();
try (FileSystem fs = FileSystem.get(conf)) {
int recno = Integer.parseInt(remainingArgs[0]);
String segment = remainingArgs[1];
String filename = new Path(segment, ParseText.DIR_NAME).toString();
ParseText parseText = new ParseText();
ArrayFile.Reader parseTexts = new ArrayFile.Reader(fs, filename, conf);
parseTexts.get(recno, parseText);
System.out.println("Retrieved " + recno + " from file " + filename);
System.out.println(parseText);
parseTexts.close();
}
}
}