src/java/org/apache/nutch/tools/DmozParser.java - nutch - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.nutch.tools;

 import java.io.BufferedInputStream;
 import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FilterReader;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.io.Reader;
 import java.lang.invoke.MethodHandles;
 import java.util.Random;
 import java.util.Vector;
 import java.util.regex.Pattern;

 import javax.xml.parsers.ParserConfigurationException;
 import javax.xml.parsers.SAXParser;
 import javax.xml.parsers.SAXParserFactory;

 import org.apache.xerces.util.XMLChar;

 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.xml.sax.Attributes;
 import org.xml.sax.InputSource;
 import org.xml.sax.Locator;
 import org.xml.sax.SAXException;
 import org.xml.sax.SAXParseException;
 import org.xml.sax.XMLReader;
 import org.xml.sax.helpers.DefaultHandler;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.io.MD5Hash;
 import org.apache.nutch.util.NutchConfiguration;

 /**
  * Utility that converts <a href="http://www.dmoztools.net/">DMOZ</a>
  * RDF into a flat file of URLs to be injected.
  */
 public class DmozParser {
   private static final Logger LOG = LoggerFactory
       .getLogger(MethodHandles.lookup().lookupClass());

   long pages = 0;

   /**
    * Reader wrapper that rewrites characters the XML parser would reject, so
    * that slightly damaged input can still be parsed.
    * <p>
    * Two repairs are applied: any character that is not a valid XML character
    * becomes {@code 'X'}, and a {@code '<'} that directly follows U+FFFD and is
    * not the start of a closing tag ({@code "</"}) also becomes {@code 'X'}.
    */
   private static class XMLCharFilter extends FilterReader {
     /**
      * Numeric value of U+FFFD. Presumably this shows up where the charset
      * decoder could not decode the input bytes - TODO confirm.
      */
     private static final int REPLACEMENT_CHAR = 65533;

     /** True when the previously read character was U+FFFD. */
     private boolean lastBad = false;

     public XMLCharFilter(Reader reader) {
       super(reader);
     }

     /**
      * Reads one character and repairs it if necessary.
      *
      * @return the (possibly replaced) character, or -1 at end of stream
      * @throws IOException if the wrapped reader fails, including on
      *           mark/reset used for the one-character look-ahead
      */
     @Override
     public int read() throws IOException {
       final int raw = in.read();
       int result = raw;
       if (raw != -1) {
         if (!XMLChar.isValid(raw)) {
           // invalid XML character
           result = 'X';
         } else if (lastBad && raw == '<') {
           // possible mis-matched bracket: peek at the next character
           in.mark(1);
           final int next = in.read();
           if (next != '/') {
             result = 'X';
           }
           in.reset();
         }
       }
       lastBad = (raw == REPLACEMENT_CHAR);
       return result;
     }

     /**
      * Fills {@code cbuf} from the wrapped reader and repairs the characters
      * that were read, in place.
      * <p>
      * The look-ahead for the bracket repair never crosses the end of the
      * characters returned by this call: a {@code '<'} in the last position is
      * left untouched.
      *
      * @return the number of characters read, or -1 at end of stream
      * @throws IOException if the wrapped reader fails
      */
     @Override
     public int read(char[] cbuf, int off, int len) throws IOException {
       final int count = in.read(cbuf, off, len);
       if (count == -1) {
         return count;
       }

       final int end = off + count;
       for (int pos = off; pos < end; pos++) {
         final char current = cbuf[pos];
         if (!XMLChar.isValid(current)) {
           // invalid XML character
           cbuf[pos] = 'X';
         } else if (lastBad && current == '<' && pos + 1 < end
             && cbuf[pos + 1] != '/') {
           // mis-matched bracket after a replacement character
           cbuf[pos] = 'X';
         }
         lastBad = (current == REPLACEMENT_CHAR);
       }
       return count;
     }
   }

   /**
    * SAX handler for the DMOZ RDF dump. It tracks the current {@code Topic},
    * decides for every {@code ExternalPage} whether it passes the adult, topic
    * and subset filters, and prints the URL of each accepted page to standard
    * output while counting it in the enclosing parser's {@code pages} field.
    */
   private class RDFProcessor extends DefaultHandler {
     String curURL = null, curSection = null;
     boolean titlePending = false, descPending = false;
     Pattern topicPattern = null;
     StringBuffer title = new StringBuffer(), desc = new StringBuffer();
     @SuppressWarnings("unused")
     XMLReader reader;
     int subsetDenom;
     int hashSkew;
     boolean includeAdult;
     Locator location;

     /**
      * Creates a handler.
      *
      * @param reader the XMLReader this handler is attached to (stored only)
      * @param subsetDenom only pages whose skewed URL hash is divisible by this
      *          value are emitted; must not be 0
      * @param includeAdult whether pages under {@code Top/Adult} are emitted
      * @param skew value XOR-ed into the URL hash; 0 selects a random skew
      * @param topicPattern pattern the current section must match, or null to
      *          accept every section
      * @throws IllegalArgumentException if {@code subsetDenom} is 0
      */
     public RDFProcessor(XMLReader reader, int subsetDenom,
         boolean includeAdult, int skew, Pattern topicPattern)
         throws IOException {
       if (subsetDenom == 0) {
         // Fail up front instead of with a division by zero in mid-parse.
         throw new IllegalArgumentException("subsetDenom must not be 0");
       }
       this.reader = reader;
       this.subsetDenom = subsetDenom;
       this.includeAdult = includeAdult;
       this.topicPattern = topicPattern;

       this.hashSkew = skew != 0 ? skew : new Random().nextInt();
     }

     //
     // Interface ContentHandler
     //

     /**
      * Start of an XML element: records the section of a {@code Topic}, applies
      * the filters to an {@code ExternalPage}, and starts collecting title or
      * description text for a page that has been accepted.
      */
     @Override
     public void startElement(String namespaceURI, String localName,
         String qName, Attributes atts) throws SAXException {
       if ("Topic".equals(qName)) {
         curSection = atts.getValue("r:id");
       } else if ("ExternalPage".equals(qName)) {
         // The section is unknown when a page precedes every Topic or the
         // Topic had no r:id; treat it as empty rather than failing with NPE.
         final String section = curSection == null ? "" : curSection;

         // Porn filter
         if (!includeAdult && section.startsWith("Top/Adult")) {
           return;
         }

         if (topicPattern != null && !topicPattern.matcher(section).matches()) {
           return;
         }

         String url = atts.getValue("about");
         if (url == null) {
           // No URL to hash or print; skip the malformed entry.
           return;
         }

         // Subset denominator filter.
         // Only emit with a chance of 1/denominator.
         int hashValue = MD5Hash.digest(url).hashCode();
         hashValue = Math.abs(hashValue ^ hashSkew);
         if ((hashValue % subsetDenom) != 0) {
           return;
         }

         // We actually claim the URL!
         curURL = url;
       } else if (curURL != null && "d:Title".equals(qName)) {
         titlePending = true;
       } else if (curURL != null && "d:Description".equals(qName)) {
         descPending = true;
       }
     }

     /**
      * Character data: appended to the title or description buffer while the
      * corresponding element of an accepted page is open.
      */
     @Override
     public void characters(char ch[], int start, int length) {
       if (titlePending) {
         title.append(ch, start, length);
       } else if (descPending) {
         desc.append(ch, start, length);
       }
     }

     /**
      * End of an XML element: when an accepted {@code ExternalPage} closes, its
      * URL is printed, the page counter is incremented and the per-page state
      * is reset.
      */
     @Override
     public void endElement(String namespaceURI, String localName, String qName)
         throws SAXException {
       if (curURL == null) {
         return;
       }

       if ("ExternalPage".equals(qName)) {
         // Emit the page and count it.
         System.out.println(curURL);
         pages++;

         // Clear out the link text. This is what
         // you would use for adding to the linkdb.
         title.setLength(0);
         desc.setLength(0);

         // Null out the URL.
         curURL = null;
       } else if ("d:Title".equals(qName)) {
         titlePending = false;
       } else if ("d:Description".equals(qName)) {
         descPending = false;
       }
     }

     /**
      * When parsing begins
      */
     @Override
     public void startDocument() {
       LOG.info("Begin parse");
     }

     /**
      * When parsing ends
      */
     @Override
     public void endDocument() {
       LOG.info("Completed parse.  Found " + pages + " pages.");
     }

     /**
      * Stores the locator handed over by the parser. The stored value is not
      * read anywhere in this class.
      */
     @Override
     public void setDocumentLocator(Locator locator) {
       location = locator;
     }

     //
     // Interface ErrorHandler
     //

     /**
      * Logs a recoverable parse error; parsing continues.
      */
     @Override
     public void error(SAXParseException spe) {
       if (LOG.isErrorEnabled()) {
         LOG.error("Error: " + spe.toString() + ": " + spe.getMessage());
       }
     }

     /**
      * Logs a parser warning; parsing continues.
      */
     @Override
     public void warning(SAXParseException spe) {
       if (LOG.isWarnEnabled()) {
         LOG.warn("Warning: " + spe.toString() + ": " + spe.getMessage());
       }
     }
   }

   /**
    * Parses a structured DMOZ RDF file and prints the URL of every page that
    * passes the filters to standard output.
    *
    * @param dmozFile the DMOZ RDF file, read as UTF-8
    * @param subsetDenom only pages whose skewed URL hash is divisible by this
    *          value are emitted
    * @param includeAdult whether pages under {@code Top/Adult} are emitted
    * @param skew value XOR-ed into the URL hash; 0 selects a random skew
    * @param topicPattern pattern a page's section must match, or null to
    *          accept every section
    * @throws IOException if the file cannot be opened or read
    * @throws SAXException if the parser reports a fatal error
    * @throws ParserConfigurationException if no SAX parser can be created
    */
   public void parseDmozFile(File dmozFile, int subsetDenom,
       boolean includeAdult, int skew, Pattern topicPattern)
               throws IOException, SAXException, ParserConfigurationException {

     SAXParserFactory parserFactory = SAXParserFactory.newInstance();
     SAXParser parser = parserFactory.newSAXParser();
     XMLReader reader = parser.getXMLReader();

     // Create our own processor to receive SAX events
     RDFProcessor rp = new RDFProcessor(reader, subsetDenom, includeAdult, skew,
         topicPattern);
     reader.setContentHandler(rp);
     reader.setErrorHandler(rp);
     LOG.info("skew = " + rp.hashSkew);

     //
     // Open filtered text stream. The XMLCharFilter replaces characters
     // that are not valid XML characters with 'X' before the parser sees
     // them.
     //
     try (XMLCharFilter in = new XMLCharFilter(new BufferedReader(
         new InputStreamReader(new BufferedInputStream(new FileInputStream(
             dmozFile)), "UTF-8")))) {
       reader.parse(new InputSource(in));
     } catch (IOException | SAXException e) {
       // Report the failure to the caller instead of terminating the JVM
       // with a "success" exit status; keep the stack trace in the log.
       LOG.error("Failed to parse DMOZ file " + dmozFile, e);
       throw e;
     }
   }

   /**
    * Reads topics from a UTF-8 text file, one per line, and appends them to
    * {@code topics}. Surrounding whitespace is removed and blank lines are
    * skipped, because an empty topic would become an empty alternative in the
    * topic pattern and match every section.
    *
    * @param topicFile path of the topic list file
    * @param topics list the topics are appended to
    * @throws IOException if the file cannot be opened or read
    */
   private static void addTopicsFromFile(String topicFile, Vector<String> topics)
       throws IOException {
     try (BufferedReader in = new BufferedReader(new InputStreamReader(
         new FileInputStream(topicFile), "UTF-8"))) {
       String line;
       while ((line = in.readLine()) != null) {
         String topic = line.trim();
         if (!topic.isEmpty()) {
           topics.addElement(topic);
         }
       }
     } catch (IOException e) {
       // Propagate instead of exiting the JVM with status 0.
       LOG.error("Failed to read topic file " + topicFile, e);
       throw e;
     }
   }

   /**
    * Command-line access. Parses the options, builds the optional topic
    * pattern and runs the parser on the given DMOZ file. By default, Adult
    * material (as categorized by DMOZ) is ignored.
    * <p>
    * Arguments that are not recognized options are ignored. A value-taking
    * option without a value, or a {@code -subset} of 0, prints the usage
    * message and returns without parsing.
    *
    * @param argv {@code <dmoz_file>} followed by options, see the usage text
    * @throws Exception if an option value is not a number, or reading or
    *           parsing a file fails
    */
   public static void main(String[] argv) throws Exception {
     if (argv.length < 1) {
       printUsage();
       return;
     }

     //
     // Parse the command line, figure out what kind of
     // URL file we need to load
     //
     int subsetDenom = 1;
     int skew = 0;
     String dmozFile = argv[0];
     boolean includeAdult = false;
     Pattern topicPattern = null;
     Vector<String> topics = new Vector<>();

     Configuration conf = NutchConfiguration.create();
     // NOTE(review): fs is never used below; it is still opened and closed
     // here exactly as before - confirm whether it can be dropped.
     try (FileSystem fs = FileSystem.get(conf)) {
       for (int i = 1; i < argv.length; i++) {
         String option = argv[i];
         if ("-includeAdultMaterial".equals(option)) {
           includeAdult = true;
           continue;
         }

         boolean takesValue = "-subset".equals(option)
             || "-topic".equals(option) || "-topicFile".equals(option)
             || "-skew".equals(option);
         if (!takesValue) {
           continue;
         }
         if (i + 1 >= argv.length) {
           // Previously this ran off the end of argv.
           System.err.println("Missing value for option " + option);
           printUsage();
           return;
         }

         String value = argv[++i];
         if ("-subset".equals(option)) {
           subsetDenom = Integer.parseInt(value);
         } else if ("-topic".equals(option)) {
           topics.addElement(value);
         } else if ("-topicFile".equals(option)) {
           addTopicsFromFile(value, topics);
         } else {
           skew = Integer.parseInt(value);
         }
       }

       if (subsetDenom == 0) {
         // The denominator is used as a divisor while parsing.
         System.err.println("-subset must not be 0");
         printUsage();
         return;
       }

       DmozParser parser = new DmozParser();

       if (!topics.isEmpty()) {
         // Topics are inserted verbatim, so they act as regular expressions.
         StringBuilder regExp = new StringBuilder("^(");
         for (int j = 0; j < topics.size(); j++) {
           if (j > 0) {
             regExp.append('|');
           }
           regExp.append(topics.get(j));
         }
         regExp.append(").*");
         LOG.info("Topic selection pattern = " + regExp);
         topicPattern = Pattern.compile(regExp.toString());
       }

       parser.parseDmozFile(new File(dmozFile), subsetDenom, includeAdult, skew,
           topicPattern);

     }
   }

   /** Prints the command-line usage message to standard error. */
   private static void printUsage() {
     System.err
         .println("Usage: DmozParser <dmoz_file> [-subset <subsetDenominator>] [-includeAdultMaterial] [-skew skew] [-topicFile <topic list file>] [-topic <topic> [-topic <topic> [...]]]");
   }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.nutch.tools;

	import java.io.BufferedInputStream;
	import java.io.BufferedReader;
	import java.io.File;
	import java.io.FileInputStream;
	import java.io.FilterReader;
	import java.io.IOException;
	import java.io.InputStreamReader;
	import java.io.Reader;
	import java.lang.invoke.MethodHandles;
	import java.util.Random;
	import java.util.Vector;
	import java.util.regex.Pattern;

	import javax.xml.parsers.ParserConfigurationException;
	import javax.xml.parsers.SAXParser;
	import javax.xml.parsers.SAXParserFactory;

	import org.apache.xerces.util.XMLChar;

	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;
	import org.xml.sax.Attributes;
	import org.xml.sax.InputSource;
	import org.xml.sax.Locator;
	import org.xml.sax.SAXException;
	import org.xml.sax.SAXParseException;
	import org.xml.sax.XMLReader;
	import org.xml.sax.helpers.DefaultHandler;
	import org.apache.hadoop.conf.Configuration;
	import org.apache.hadoop.fs.FileSystem;
	import org.apache.hadoop.io.MD5Hash;
	import org.apache.nutch.util.NutchConfiguration;

	/**
	* Utility that converts <a href="http://www.dmoztools.net/">DMOZ</a>
	* RDF into a flat file of URLs to be injected.
	*/
	public class DmozParser {
	private static final Logger LOG = LoggerFactory
	.getLogger(MethodHandles.lookup().lookupClass());

	long pages = 0;

	/**
	 * Reader wrapper that rewrites characters the XML parser would reject, so
	 * that slightly damaged input can still be parsed.
	 * <p>
	 * Two repairs are applied: any character that is not a valid XML character
	 * becomes {@code 'X'}, and a {@code '<'} that directly follows U+FFFD and is
	 * not the start of a closing tag ({@code "</"}) also becomes {@code 'X'}.
	 */
	private static class XMLCharFilter extends FilterReader {
	  /**
	   * Numeric value of U+FFFD. Presumably this shows up where the charset
	   * decoder could not decode the input bytes - TODO confirm.
	   */
	  private static final int REPLACEMENT_CHAR = 65533;

	  /** True when the previously read character was U+FFFD. */
	  private boolean lastBad = false;

	  public XMLCharFilter(Reader reader) {
	    super(reader);
	  }

	  /**
	   * Reads one character and repairs it if necessary.
	   *
	   * @return the (possibly replaced) character, or -1 at end of stream
	   * @throws IOException if the wrapped reader fails, including on
	   *           mark/reset used for the one-character look-ahead
	   */
	  @Override
	  public int read() throws IOException {
	    final int raw = in.read();
	    int result = raw;
	    if (raw != -1) {
	      if (!XMLChar.isValid(raw)) {
	        // invalid XML character
	        result = 'X';
	      } else if (lastBad && raw == '<') {
	        // possible mis-matched bracket: peek at the next character
	        in.mark(1);
	        final int next = in.read();
	        if (next != '/') {
	          result = 'X';
	        }
	        in.reset();
	      }
	    }
	    lastBad = (raw == REPLACEMENT_CHAR);
	    return result;
	  }

	  /**
	   * Fills {@code cbuf} from the wrapped reader and repairs the characters
	   * that were read, in place.
	   * <p>
	   * The look-ahead for the bracket repair never crosses the end of the
	   * characters returned by this call: a {@code '<'} in the last position is
	   * left untouched.
	   *
	   * @return the number of characters read, or -1 at end of stream
	   * @throws IOException if the wrapped reader fails
	   */
	  @Override
	  public int read(char[] cbuf, int off, int len) throws IOException {
	    final int count = in.read(cbuf, off, len);
	    if (count == -1) {
	      return count;
	    }

	    final int end = off + count;
	    for (int pos = off; pos < end; pos++) {
	      final char current = cbuf[pos];
	      if (!XMLChar.isValid(current)) {
	        // invalid XML character
	        cbuf[pos] = 'X';
	      } else if (lastBad && current == '<' && pos + 1 < end
	          && cbuf[pos + 1] != '/') {
	        // mis-matched bracket after a replacement character
	        cbuf[pos] = 'X';
	      }
	      lastBad = (current == REPLACEMENT_CHAR);
	    }
	    return count;
	  }
	}

	/**
	 * SAX handler for the DMOZ RDF dump. It tracks the current {@code Topic},
	 * decides for every {@code ExternalPage} whether it passes the adult, topic
	 * and subset filters, and prints the URL of each accepted page to standard
	 * output while counting it in the enclosing parser's {@code pages} field.
	 */
	private class RDFProcessor extends DefaultHandler {
	  String curURL = null, curSection = null;
	  boolean titlePending = false, descPending = false;
	  Pattern topicPattern = null;
	  StringBuffer title = new StringBuffer(), desc = new StringBuffer();
	  @SuppressWarnings("unused")
	  XMLReader reader;
	  int subsetDenom;
	  int hashSkew;
	  boolean includeAdult;
	  Locator location;

	  /**
	   * Creates a handler.
	   *
	   * @param reader the XMLReader this handler is attached to (stored only)
	   * @param subsetDenom only pages whose skewed URL hash is divisible by this
	   *          value are emitted; must not be 0
	   * @param includeAdult whether pages under {@code Top/Adult} are emitted
	   * @param skew value XOR-ed into the URL hash; 0 selects a random skew
	   * @param topicPattern pattern the current section must match, or null to
	   *          accept every section
	   * @throws IllegalArgumentException if {@code subsetDenom} is 0
	   */
	  public RDFProcessor(XMLReader reader, int subsetDenom,
	      boolean includeAdult, int skew, Pattern topicPattern)
	      throws IOException {
	    if (subsetDenom == 0) {
	      // Fail up front instead of with a division by zero in mid-parse.
	      throw new IllegalArgumentException("subsetDenom must not be 0");
	    }
	    this.reader = reader;
	    this.subsetDenom = subsetDenom;
	    this.includeAdult = includeAdult;
	    this.topicPattern = topicPattern;

	    this.hashSkew = skew != 0 ? skew : new Random().nextInt();
	  }

	  //
	  // Interface ContentHandler
	  //

	  /**
	   * Start of an XML element: records the section of a {@code Topic}, applies
	   * the filters to an {@code ExternalPage}, and starts collecting title or
	   * description text for a page that has been accepted.
	   */
	  @Override
	  public void startElement(String namespaceURI, String localName,
	      String qName, Attributes atts) throws SAXException {
	    if ("Topic".equals(qName)) {
	      curSection = atts.getValue("r:id");
	    } else if ("ExternalPage".equals(qName)) {
	      // The section is unknown when a page precedes every Topic or the
	      // Topic had no r:id; treat it as empty rather than failing with NPE.
	      final String section = curSection == null ? "" : curSection;

	      // Porn filter
	      if (!includeAdult && section.startsWith("Top/Adult")) {
	        return;
	      }

	      if (topicPattern != null && !topicPattern.matcher(section).matches()) {
	        return;
	      }

	      String url = atts.getValue("about");
	      if (url == null) {
	        // No URL to hash or print; skip the malformed entry.
	        return;
	      }

	      // Subset denominator filter.
	      // Only emit with a chance of 1/denominator.
	      int hashValue = MD5Hash.digest(url).hashCode();
	      hashValue = Math.abs(hashValue ^ hashSkew);
	      if ((hashValue % subsetDenom) != 0) {
	        return;
	      }

	      // We actually claim the URL!
	      curURL = url;
	    } else if (curURL != null && "d:Title".equals(qName)) {
	      titlePending = true;
	    } else if (curURL != null && "d:Description".equals(qName)) {
	      descPending = true;
	    }
	  }

	  /**
	   * Character data: appended to the title or description buffer while the
	   * corresponding element of an accepted page is open.
	   */
	  @Override
	  public void characters(char ch[], int start, int length) {
	    if (titlePending) {
	      title.append(ch, start, length);
	    } else if (descPending) {
	      desc.append(ch, start, length);
	    }
	  }

	  /**
	   * End of an XML element: when an accepted {@code ExternalPage} closes, its
	   * URL is printed, the page counter is incremented and the per-page state
	   * is reset.
	   */
	  @Override
	  public void endElement(String namespaceURI, String localName, String qName)
	      throws SAXException {
	    if (curURL == null) {
	      return;
	    }

	    if ("ExternalPage".equals(qName)) {
	      // Emit the page and count it.
	      System.out.println(curURL);
	      pages++;

	      // Clear out the link text. This is what
	      // you would use for adding to the linkdb.
	      title.setLength(0);
	      desc.setLength(0);

	      // Null out the URL.
	      curURL = null;
	    } else if ("d:Title".equals(qName)) {
	      titlePending = false;
	    } else if ("d:Description".equals(qName)) {
	      descPending = false;
	    }
	  }

	  /**
	   * When parsing begins
	   */
	  @Override
	  public void startDocument() {
	    LOG.info("Begin parse");
	  }

	  /**
	   * When parsing ends
	   */
	  @Override
	  public void endDocument() {
	    LOG.info("Completed parse. Found " + pages + " pages.");
	  }

	  /**
	   * Stores the locator handed over by the parser. The stored value is not
	   * read anywhere in this class.
	   */
	  @Override
	  public void setDocumentLocator(Locator locator) {
	    location = locator;
	  }

	  //
	  // Interface ErrorHandler
	  //

	  /**
	   * Logs a recoverable parse error; parsing continues.
	   */
	  @Override
	  public void error(SAXParseException spe) {
	    if (LOG.isErrorEnabled()) {
	      LOG.error("Error: " + spe.toString() + ": " + spe.getMessage());
	    }
	  }

	  /**
	   * Logs a parser warning; parsing continues.
	   */
	  @Override
	  public void warning(SAXParseException spe) {
	    if (LOG.isWarnEnabled()) {
	      LOG.warn("Warning: " + spe.toString() + ": " + spe.getMessage());
	    }
	  }
	}

	/**
	 * Parses a structured DMOZ RDF file and prints the URL of every page that
	 * passes the filters to standard output.
	 *
	 * @param dmozFile the DMOZ RDF file, read as UTF-8
	 * @param subsetDenom only pages whose skewed URL hash is divisible by this
	 *          value are emitted
	 * @param includeAdult whether pages under {@code Top/Adult} are emitted
	 * @param skew value XOR-ed into the URL hash; 0 selects a random skew
	 * @param topicPattern pattern a page's section must match, or null to
	 *          accept every section
	 * @throws IOException if the file cannot be opened or read
	 * @throws SAXException if the parser reports a fatal error
	 * @throws ParserConfigurationException if no SAX parser can be created
	 */
	public void parseDmozFile(File dmozFile, int subsetDenom,
	    boolean includeAdult, int skew, Pattern topicPattern)
	    throws IOException, SAXException, ParserConfigurationException {

	  SAXParserFactory parserFactory = SAXParserFactory.newInstance();
	  SAXParser parser = parserFactory.newSAXParser();
	  XMLReader reader = parser.getXMLReader();

	  // Create our own processor to receive SAX events
	  RDFProcessor rp = new RDFProcessor(reader, subsetDenom, includeAdult, skew,
	      topicPattern);
	  reader.setContentHandler(rp);
	  reader.setErrorHandler(rp);
	  LOG.info("skew = " + rp.hashSkew);

	  //
	  // Open filtered text stream. The XMLCharFilter replaces characters
	  // that are not valid XML characters with 'X' before the parser sees
	  // them.
	  //
	  try (XMLCharFilter in = new XMLCharFilter(new BufferedReader(
	      new InputStreamReader(new BufferedInputStream(new FileInputStream(
	          dmozFile)), "UTF-8")))) {
	    reader.parse(new InputSource(in));
	  } catch (IOException | SAXException e) {
	    // Report the failure to the caller instead of terminating the JVM
	    // with a "success" exit status; keep the stack trace in the log.
	    LOG.error("Failed to parse DMOZ file " + dmozFile, e);
	    throw e;
	  }
	}

	/**
	 * Reads topics from a UTF-8 text file, one per line, and appends them to
	 * {@code topics}. Surrounding whitespace is removed and blank lines are
	 * skipped, because an empty topic would become an empty alternative in the
	 * topic pattern and match every section.
	 *
	 * @param topicFile path of the topic list file
	 * @param topics list the topics are appended to
	 * @throws IOException if the file cannot be opened or read
	 */
	private static void addTopicsFromFile(String topicFile, Vector<String> topics)
	    throws IOException {
	  try (BufferedReader in = new BufferedReader(new InputStreamReader(
	      new FileInputStream(topicFile), "UTF-8"))) {
	    String line;
	    while ((line = in.readLine()) != null) {
	      String topic = line.trim();
	      if (!topic.isEmpty()) {
	        topics.addElement(topic);
	      }
	    }
	  } catch (IOException e) {
	    // Propagate instead of exiting the JVM with status 0.
	    LOG.error("Failed to read topic file " + topicFile, e);
	    throw e;
	  }
	}

	/**
	* Command-line access. User may add URLs via a flat text file or the
	* structured DMOZ file. By default, we ignore Adult material (as categorized
	* by DMOZ).
	*/
	public static void main(String[] argv) throws Exception {
	if (argv.length < 1) {
	System.err
	.println("Usage: DmozParser <dmoz_file> [-subset <subsetDenominator>] [-includeAdultMaterial] [-skew skew] [-topicFile <topic list file>] [-topic <topic> [-topic <topic> [...]]]");
	return;
	}

	//
	// Parse the command line, figure out what kind of
	// URL file we need to load
	//
	int subsetDenom = 1;
	int skew = 0;
	String dmozFile = argv[0];
	boolean includeAdult = false;
	Pattern topicPattern = null;
	Vector<String> topics = new Vector<>();

	Configuration conf = NutchConfiguration.create();
	try (FileSystem fs = FileSystem.get(conf)) {
	for (int i = 1; i < argv.length; i++) {
	if ("-includeAdultMaterial".equals(argv[i])) {
	includeAdult = true;
	} else if ("-subset".equals(argv[i])) {
	subsetDenom = Integer.parseInt(argv[i + 1]);
	i++;
	} else if ("-topic".equals(argv[i])) {
	topics.addElement(argv[i + 1]);
	i++;
	} else if ("-topicFile".equals(argv[i])) {
	addTopicsFromFile(argv[i + 1], topics);
	i++;
	} else if ("-skew".equals(argv[i])) {
	skew = Integer.parseInt(argv[i + 1]);
	i++;
	}
	}

	DmozParser parser = new DmozParser();

	if (!topics.isEmpty()) {
	String regExp = "^(";
	int j = 0;
	for (; j < topics.size() - 1; ++j) {
	regExp = regExp.concat(topics.get(j));
	regExp = regExp.concat("\|");
	}
	regExp = regExp.concat(topics.get(j));
	regExp = regExp.concat(").*");
	LOG.info("Topic selection pattern = " + regExp);
	topicPattern = Pattern.compile(regExp);
	}

	parser.parseDmozFile(new File(dmozFile), subsetDenom, includeAdult, skew,
	topicPattern);

	}
	}

	}