/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.tools;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FilterReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.lang.invoke.MethodHandles;
import java.util.Random;
import java.util.Vector;
import java.util.regex.Pattern;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.apache.xerces.util.XMLChar;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.MD5Hash;
import org.apache.nutch.util.NutchConfiguration;
/**
* Utility that converts <a href="http://www.dmoztools.net/">DMOZ</a>
* RDF into a flat file of URLs to be injected.
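 * The selected URLs are written to standard output, one per line.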
*/
public class DmozParser {
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
long pages = 0;
/**
* This filter fixes characters that might offend our parser. This lets us be
* tolerant of errors that might appear in the input XML.
*/
private static class XMLCharFilter extends FilterReader {
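    // True when the previous character read was U+FFFD (65533), the Unicode
    // replacement character, which indicates the decoder hit malformed input.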
private boolean lastBad = false;
public XMLCharFilter(Reader reader) {
super(reader);
}
public int read() throws IOException {
int c = in.read();
int value = c;
if (c != -1 && !(XMLChar.isValid(c))) // fix invalid characters
value = 'X';
else if (lastBad && c == '<') { // fix mis-matched brackets
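        // The previous character was a replacement char; peek at the next
        // character and mask this '<' as well unless it starts a closing tag.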
in.mark(1);
if (in.read() != '/')
value = 'X';
in.reset();
}
lastBad = (c == 65533);
return value;
}
public int read(char[] cbuf, int off, int len) throws IOException {
int n = in.read(cbuf, off, len);
if (n != -1) {
for (int i = 0; i < n; i++) {
char c = cbuf[off + i];
char value = c;
if (!(XMLChar.isValid(c))) // fix invalid characters
value = 'X';
else if (lastBad && c == '<') { // fix mis-matched brackets
if (i != n - 1 && cbuf[off + i + 1] != '/')
value = 'X';
}
lastBad = (c == 65533);
cbuf[off + i] = value;
}
}
return n;
}
}
/**
* The RDFProcessor receives tag messages during a parse of RDF XML data. We
* build whatever structures we need from these messages.
*/
private class RDFProcessor extends DefaultHandler {
String curURL = null, curSection = null;
boolean titlePending = false, descPending = false;
Pattern topicPattern = null;
StringBuffer title = new StringBuffer(), desc = new StringBuffer();
@SuppressWarnings("unused")
XMLReader reader;
int subsetDenom;
int hashSkew;
boolean includeAdult;
Locator location;
    /**
     * Pass in an XMLReader, the subset denominator, a flag as to whether we
     * should include adult material, a hash skew for subset selection, and an
     * optional topic pattern that restricts which URLs are emitted.
     */
public RDFProcessor(XMLReader reader, int subsetDenom,
boolean includeAdult, int skew, Pattern topicPattern)
throws IOException {
this.reader = reader;
this.subsetDenom = subsetDenom;
this.includeAdult = includeAdult;
this.topicPattern = topicPattern;
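      // A skew of 0 means none was supplied; fall back to a random value that
      // is XORed into the URL hash during subset selection.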
this.hashSkew = skew != 0 ? skew : new Random().nextInt();
}
//
// Interface ContentHandler
//
    /**
     * Start of an XML element
     */
public void startElement(String namespaceURI, String localName,
String qName, Attributes atts) throws SAXException {
if ("Topic".equals(qName)) {
curSection = atts.getValue("r:id");
} else if ("ExternalPage".equals(qName)) {
// Porn filter
if ((!includeAdult) && curSection.startsWith("Top/Adult")) {
return;
}
if (topicPattern != null && !topicPattern.matcher(curSection).matches()) {
return;
}
// Subset denominator filter.
// Only emit with a chance of 1/denominator.
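        // The URL's MD5-based hash is XORed with the skew and reduced modulo
        // the denominator; only URLs that land on zero are emitted.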
String url = atts.getValue("about");
int hashValue = MD5Hash.digest(url).hashCode();
hashValue = Math.abs(hashValue ^ hashSkew);
if ((hashValue % subsetDenom) != 0) {
return;
}
// We actually claim the URL!
curURL = url;
} else if (curURL != null && "d:Title".equals(qName)) {
titlePending = true;
} else if (curURL != null && "d:Description".equals(qName)) {
descPending = true;
}
}
    /**
     * The contents of an XML element
     */
public void characters(char ch[], int start, int length) {
if (titlePending) {
title.append(ch, start, length);
} else if (descPending) {
desc.append(ch, start, length);
}
}
    /**
     * Termination of an XML element
     */
public void endElement(String namespaceURI, String localName, String qName)
throws SAXException {
if (curURL != null) {
if ("ExternalPage".equals(qName)) {
//
// Inc the number of pages, insert the page, and
// possibly print status.
//
System.out.println(curURL);
pages++;
//
// Clear out the link text. This is what
// you would use for adding to the linkdb.
//
if (title.length() > 0) {
title.delete(0, title.length());
}
if (desc.length() > 0) {
desc.delete(0, desc.length());
}
// Null out the URL.
curURL = null;
} else if ("d:Title".equals(qName)) {
titlePending = false;
} else if ("d:Description".equals(qName)) {
descPending = false;
}
}
}
/**
* When parsing begins
*/
public void startDocument() {
LOG.info("Begin parse");
}
/**
* When parsing ends
*/
public void endDocument() {
LOG.info("Completed parse. Found " + pages + " pages.");
}
/**
* From time to time the Parser will set the "current location" by calling
* this function. It's useful for emitting locations for error messages.
*/
public void setDocumentLocator(Locator locator) {
location = locator;
}
//
// Interface ErrorHandler
//
/**
* Emit the exception message
*/
public void error(SAXParseException spe) {
if (LOG.isErrorEnabled()) {
LOG.error("Error: " + spe.toString() + ": " + spe.getMessage());
}
}
    /**
     * Emit the warning message
     */
public void warning(SAXParseException spe) {
if (LOG.isWarnEnabled()) {
LOG.warn("Warning: " + spe.toString() + ": " + spe.getMessage());
}
}
}
  /**
   * Iterate through all the items in this structured DMOZ file and emit each
   * selected URL to standard output.
   */
public void parseDmozFile(File dmozFile, int subsetDenom,
boolean includeAdult, int skew, Pattern topicPattern)
throws IOException, SAXException, ParserConfigurationException {
SAXParserFactory parserFactory = SAXParserFactory.newInstance();
SAXParser parser = parserFactory.newSAXParser();
XMLReader reader = parser.getXMLReader();
// Create our own processor to receive SAX events
RDFProcessor rp = new RDFProcessor(reader, subsetDenom, includeAdult, skew,
topicPattern);
reader.setContentHandler(rp);
reader.setErrorHandler(rp);
LOG.info("skew = " + rp.hashSkew);
    //
    // Open a filtered character stream. The XMLCharFilter makes sure that
    // only valid XML characters reach the parser. Any non-conforming
    // characters are silently replaced.
    //
try (XMLCharFilter in = new XMLCharFilter(new BufferedReader(
new InputStreamReader(new BufferedInputStream(new FileInputStream(
dmozFile)), "UTF-8")))) {
InputSource is = new InputSource(in);
reader.parse(is);
} catch (Exception e) {
if (LOG.isErrorEnabled()) {
LOG.error(e.toString());
}
System.exit(0);
}
}
private static void addTopicsFromFile(String topicFile, Vector<String> topics)
throws IOException {
try (BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(
topicFile), "UTF-8"))) {
String line = null;
while ((line = in.readLine()) != null) {
topics.addElement(line);
}
} catch (Exception e) {
if (LOG.isErrorEnabled()) {
LOG.error(e.toString());
}
System.exit(0);
}
}
  /**
   * Command-line access. Parses the structured DMOZ RDF file and writes the
   * selected URLs to standard output. By default, we ignore Adult material
   * (as categorized by DMOZ).
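   * <p>
   * A typical invocation via the Nutch launcher script (file names and the
   * subset value are illustrative):
   *
   * <pre>
   * bin/nutch org.apache.nutch.tools.DmozParser content.rdf.u8 -subset 5000 &gt; dmoz/urls
   * </pre>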
*/
public static void main(String[] argv) throws Exception {
if (argv.length < 1) {
System.err
.println("Usage: DmozParser <dmoz_file> [-subset <subsetDenominator>] [-includeAdultMaterial] [-skew skew] [-topicFile <topic list file>] [-topic <topic> [-topic <topic> [...]]]");
return;
}
    //
    // Parse the command line and collect the filtering options.
    //
int subsetDenom = 1;
int skew = 0;
String dmozFile = argv[0];
boolean includeAdult = false;
Pattern topicPattern = null;
Vector<String> topics = new Vector<>();
Configuration conf = NutchConfiguration.create();
try (FileSystem fs = FileSystem.get(conf)) {
for (int i = 1; i < argv.length; i++) {
if ("-includeAdultMaterial".equals(argv[i])) {
includeAdult = true;
} else if ("-subset".equals(argv[i])) {
subsetDenom = Integer.parseInt(argv[i + 1]);
i++;
} else if ("-topic".equals(argv[i])) {
topics.addElement(argv[i + 1]);
i++;
} else if ("-topicFile".equals(argv[i])) {
addTopicsFromFile(argv[i + 1], topics);
i++;
} else if ("-skew".equals(argv[i])) {
skew = Integer.parseInt(argv[i + 1]);
i++;
}
}
DmozParser parser = new DmozParser();
if (!topics.isEmpty()) {
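        // Build an alternation over the requested topics. For example (topic
        // names are illustrative), {"Top/Arts", "Top/Sports"} yields the
        // pattern "^(Top/Arts|Top/Sports).*".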
String regExp = "^(";
int j = 0;
for (; j < topics.size() - 1; ++j) {
regExp = regExp.concat(topics.get(j));
regExp = regExp.concat("|");
}
regExp = regExp.concat(topics.get(j));
regExp = regExp.concat(").*");
LOG.info("Topic selection pattern = " + regExp);
topicPattern = Pattern.compile(regExp);
}
parser.parseDmozFile(new File(dmozFile), subsetDenom, includeAdult, skew,
topicPattern);
}
}
}