blob: b90cfe581f4c26ea868dfa0094ac5d16972e8838 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.scoring.webgraph;
import java.io.IOException;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import org.apache.nutch.util.FSUtils;
import org.apache.nutch.util.NutchConfiguration;
/**
* Reads and prints to system out information for a single node from the NodeDb
* in the WebGraph.
*/
public class NodeReader extends Configured {
private MapFile.Reader[] nodeReaders;
public NodeReader() {
}
public NodeReader(Configuration conf) {
super(conf);
}
/**
* Prints the content of the Node represented by the url to system out.
*
* @param webGraphDb
* The webgraph from which to get the node.
* @param url
* The url of the node.
*
* @throws IOException
* If an error occurs while getting the node.
*/
public void dumpUrl(Path webGraphDb, String url) throws IOException {
nodeReaders = MapFileOutputFormat.getReaders(new Path(webGraphDb,
WebGraph.NODE_DIR), getConf());
// open the readers, get the node, print out the info, and close the readers
Text key = new Text(url);
Node node = new Node();
MapFileOutputFormat.getEntry(nodeReaders,
new HashPartitioner<>(), key, node);
System.out.println(url + ":");
System.out.println(" inlink score: " + node.getInlinkScore());
System.out.println(" outlink score: " + node.getOutlinkScore());
System.out.println(" num inlinks: " + node.getNumInlinks());
System.out.println(" num outlinks: " + node.getNumOutlinks());
FSUtils.closeReaders(nodeReaders);
}
/**
* Runs the NodeReader tool. The command line arguments must contain a
* webgraphdb path and a url. The url must match the normalized url that is
* contained in the NodeDb of the WebGraph.
*/
public static void main(String[] args) throws Exception {
Options options = new Options();
OptionBuilder.withArgName("help");
OptionBuilder.withDescription("show this help message");
Option helpOpts = OptionBuilder.create("help");
options.addOption(helpOpts);
OptionBuilder.withArgName("webgraphdb");
OptionBuilder.hasArg();
OptionBuilder.withDescription("the webgraphdb to use");
Option webGraphOpts = OptionBuilder.create("webgraphdb");
options.addOption(webGraphOpts);
OptionBuilder.withArgName("url");
OptionBuilder.hasOptionalArg();
OptionBuilder.withDescription("the url to dump");
Option urlOpts = OptionBuilder.create("url");
options.addOption(urlOpts);
CommandLineParser parser = new GnuParser();
try {
// command line must take a webgraphdb and a url
CommandLine line = parser.parse(options, args);
if (line.hasOption("help") || !line.hasOption("webgraphdb")
|| !line.hasOption("url")) {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp("WebGraphReader", options);
return;
}
// dump the values to system out and return
String webGraphDb = line.getOptionValue("webgraphdb");
String url = line.getOptionValue("url");
NodeReader reader = new NodeReader(NutchConfiguration.create());
reader.dumpUrl(new Path(webGraphDb), url);
return;
} catch (Exception e) {
e.printStackTrace();
return;
}
}
}