src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java - nutch - Git at Google

 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.creativecommons.nutch;

 import org.apache.avro.util.Utf8;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.metadata.CreativeCommons;
 import org.apache.nutch.parse.*;
 import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.storage.WebPage.Field;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.w3c.dom.*;
 import org.xml.sax.InputSource;

 import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;
 import java.io.StringReader;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.nio.ByteBuffer;
 import java.util.Collection;
 import java.util.HashMap;
 import java.util.HashSet;

 /** Adds metadata identifying the Creative Commons license used, if any. */
 public class CCParseFilter implements ParseFilter {
   public static final Logger LOG = LoggerFactory.getLogger(CCParseFilter.class);

   /** Walks DOM tree, looking for RDF in comments and licenses in anchors. */
   public static class Walker {
     private URL base; // base url of page
     private String rdfLicense; // subject url found, if any
     private URL relLicense; // license url found, if any
     private URL anchorLicense; // anchor url found, if any
     private String workType; // work type URI

     private Walker(URL base) {
       this.base = base;
     }

     /** Scan the document adding attributes to metadata. */
     public static void walk(Node doc, URL base, WebPage page, Configuration conf)
         throws ParseException {

       // walk the DOM tree, scanning for license data
       Walker walker = new Walker(base);
       walker.walk(doc);

       // interpret results of walk
       String licenseUrl = null;
       String licenseLocation = null;
       if (walker.rdfLicense != null) { // 1st choice: subject in RDF
         licenseLocation = "rdf";
         licenseUrl = walker.rdfLicense;
       } else if (walker.relLicense != null) { // 2nd: anchor w/
         // rel=license
         licenseLocation = "rel";
         licenseUrl = walker.relLicense.toString();
       } else if (walker.anchorLicense != null) { // 3rd: anchor w/ CC
         // license
         licenseLocation = "a";
         licenseUrl = walker.anchorLicense.toString();
       } else if (conf.getBoolean("creativecommons.exclude.unlicensed", false)) {
         throw new ParseException("No CC license.  Excluding.");
       }

       // add license to metadata
       if (licenseUrl != null) {
         if (LOG.isDebugEnabled()) {
           LOG.debug("CC: found " + licenseUrl + " in " + licenseLocation
               + " of " + base);
         }
         page.getMetadata().put(new Utf8(CreativeCommons.LICENSE_URL),
             ByteBuffer.wrap(licenseUrl.getBytes()));
         page.getMetadata().put(new Utf8(CreativeCommons.LICENSE_LOCATION),
             ByteBuffer.wrap(licenseLocation.getBytes()));
       }

       if (walker.workType != null) {
         if (LOG.isDebugEnabled()) {
           LOG.debug("CC: found " + walker.workType + " in " + base);
         }
         page.getMetadata().put(new Utf8(CreativeCommons.WORK_TYPE),
             ByteBuffer.wrap(walker.workType.getBytes()));
       }

     }

     /** Scan the document looking for RDF in comments and license elements. */
     private void walk(Node node) {
       // check element nodes for license URL
       if (node instanceof Element) {
         findLicenseUrl((Element) node);
       }

       // check comment nodes for license RDF
       if (node instanceof Comment) {
         findRdf(((Comment) node).getData());
       }

       // recursively walk child nodes
       NodeList children = node.getChildNodes();
       for (int i = 0; children != null && i < children.getLength(); i++) {
         walk(children.item(i));
       }
     }

     /**
      * Extract license url from element, if any. Thse are the href attribute of
      * anchor elements with rel="license". These must also point to
      * http://creativecommons.org/licenses/.
      */
     private void findLicenseUrl(Element element) {
       // only look in Anchor elements
       if (!"a".equalsIgnoreCase(element.getTagName()))
         return;

       // require an href
       String href = element.getAttribute("href");
       if (href == null)
         return;
       try {
         URL url = new URL(base, href); // resolve the url
         // check that it's a CC license URL
         if ("http".equalsIgnoreCase(url.getProtocol())
             && "creativecommons.org".equalsIgnoreCase(url.getHost())
             && url.getPath() != null && url.getPath().startsWith("/licenses/")
             && url.getPath().length() > "/licenses/".length()) {

           // check rel="license"
           String rel = element.getAttribute("rel");
           if (rel != null && "license".equals(rel) && this.relLicense == null) {
             this.relLicense = url; // found rel license
           } else if (this.anchorLicense == null) {
             this.anchorLicense = url; // found anchor license
           }
         }
       } catch (MalformedURLException e) { // ignore malformed urls
       }
     }

     /** Configure a namespace aware XML parser. */
     private static final DocumentBuilderFactory FACTORY = DocumentBuilderFactory
         .newInstance();

     static {
       FACTORY.setNamespaceAware(true);
     }

     /** Creative Commons' namespace URI. */
     private static final String CC_NS = "http://web.resource.org/cc/";

     /** Dublin Core namespace URI. */
     private static final String DC_NS = "http://purl.org/dc/elements/1.1/";

     /** RDF syntax namespace URI. */
     private static final String RDF_NS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";

     private void findRdf(String comment) {
       // first check for likely RDF in comment
       int rdfPosition = comment.indexOf("RDF");
       if (rdfPosition < 0)
         return; // no RDF, abort
       int nsPosition = comment.indexOf(CC_NS);
       if (nsPosition < 0)
         return; // no RDF, abort
       // try to parse the XML
       Document doc;
       try {
         DocumentBuilder parser = FACTORY.newDocumentBuilder();
         doc = parser.parse(new InputSource(new StringReader(comment)));
       } catch (Exception e) {
         if (LOG.isWarnEnabled()) {
           LOG.warn("CC: Failed to parse RDF in " + base + ": " + e);
         }
         // e.printStackTrace();
         return;
       }

       // check that root is rdf:RDF
       NodeList roots = doc.getElementsByTagNameNS(RDF_NS, "RDF");
       if (roots.getLength() != 1) {
         if (LOG.isWarnEnabled()) {
           LOG.warn("CC: No RDF root in " + base);
         }
         return;
       }
       Element rdf = (Element) roots.item(0);

       // get cc:License nodes inside rdf:RDF
       NodeList licenses = rdf.getElementsByTagNameNS(CC_NS, "License");
       for (int i = 0; i < licenses.getLength(); i++) {
         Element l = (Element) licenses.item(i);
         // license is rdf:about= attribute from cc:License
         this.rdfLicense = l.getAttributeNodeNS(RDF_NS, "about").getValue();

         // walk predicates of cc:License
         NodeList predicates = l.getChildNodes();
         for (int j = 0; j < predicates.getLength(); j++) {
           Node predicateNode = predicates.item(j);
           if (!(predicateNode instanceof Element))
             continue;
           Element predicateElement = (Element) predicateNode;
           // extract predicates of cc:xxx predicates
           if (!CC_NS.equals(predicateElement.getNamespaceURI())) {
             continue;
           }
           String predicate = predicateElement.getLocalName();
           // object is rdf:resource from cc:xxx predicates
           String object = predicateElement.getAttributeNodeNS(RDF_NS,
               "resource").getValue();
           // add object and predicate to metadata
           // metadata.put(object, predicate);
           // if (LOG.isInfoEnabled()) {
           // LOG.info("CC: found: "+predicate+"="+object);
           // }
         }
       }

       // get cc:Work nodes from rdf:RDF
       NodeList works = rdf.getElementsByTagNameNS(CC_NS, "Work");
       for (int i = 0; i < works.getLength(); i++) {
         Element l = (Element) works.item(i);

         // get dc:type nodes from cc:Work
         NodeList types = rdf.getElementsByTagNameNS(DC_NS, "type");
         for (int j = 0; j < types.getLength(); j++) {
           Element type = (Element) types.item(j);
           String workUri = type.getAttributeNodeNS(RDF_NS, "resource")
               .getValue();
           this.workType = (String) WORK_TYPE_NAMES.get(workUri);
           break;
         }
       }
     }
   }

   private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();

   static {
     FIELDS.add(WebPage.Field.BASE_URL);
     FIELDS.add(WebPage.Field.METADATA);
   }

   private static final HashMap<String, String> WORK_TYPE_NAMES = new HashMap<String, String>();

   static {
     WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/MovingImage", "video");
     WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/StillImage", "image");
     WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Sound", "audio");
     WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Text", "text");
     WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Interactive",
         "interactive");
     WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Software", "software");
     WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Image", "image");
   }

   private Configuration conf;

   public void setConf(Configuration conf) {
     this.conf = conf;
   }

   public Configuration getConf() {
     return this.conf;
   }

   @Override
   public Collection<Field> getFields() {
     return FIELDS;
   }

   /**
    * Adds metadata or otherwise modifies a parse of an HTML document, given the
    * DOM tree of a page.
    */
   @Override
   public Parse filter(String url, WebPage page, Parse parse,
       HTMLMetaTags metaTags, DocumentFragment doc) {
     // construct base url
     URL base;
     try {
       base = new URL(page.getBaseUrl().toString());
       // extract license metadata
       Walker.walk(doc, base, page, getConf());
     } catch (Exception e) {
       LOG.error("Error parsing " + url, e);
       return ParseStatusUtils.getEmptyParse(e, getConf());
     }

     return parse;
   }
 }
	/**
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.creativecommons.nutch;

	import org.apache.avro.util.Utf8;
	import org.apache.hadoop.conf.Configuration;
	import org.apache.nutch.metadata.CreativeCommons;
	import org.apache.nutch.parse.*;
	import org.apache.nutch.storage.WebPage;
	import org.apache.nutch.storage.WebPage.Field;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;
	import org.w3c.dom.*;
	import org.xml.sax.InputSource;

	import javax.xml.parsers.DocumentBuilder;
	import javax.xml.parsers.DocumentBuilderFactory;
	import java.io.StringReader;
	import java.net.MalformedURLException;
	import java.net.URL;
	import java.nio.ByteBuffer;
	import java.util.Collection;
	import java.util.HashMap;
	import java.util.HashSet;

	/** Adds metadata identifying the Creative Commons license used, if any. */
	public class CCParseFilter implements ParseFilter {
	public static final Logger LOG = LoggerFactory.getLogger(CCParseFilter.class);

	/** Walks DOM tree, looking for RDF in comments and licenses in anchors. */
	public static class Walker {
	private URL base; // base url of page
	private String rdfLicense; // subject url found, if any
	private URL relLicense; // license url found, if any
	private URL anchorLicense; // anchor url found, if any
	private String workType; // work type URI

	private Walker(URL base) {
	this.base = base;
	}

	/** Scan the document adding attributes to metadata. */
	public static void walk(Node doc, URL base, WebPage page, Configuration conf)
	throws ParseException {

	// walk the DOM tree, scanning for license data
	Walker walker = new Walker(base);
	walker.walk(doc);

	// interpret results of walk
	String licenseUrl = null;
	String licenseLocation = null;
	if (walker.rdfLicense != null) { // 1st choice: subject in RDF
	licenseLocation = "rdf";
	licenseUrl = walker.rdfLicense;
	} else if (walker.relLicense != null) { // 2nd: anchor w/
	// rel=license
	licenseLocation = "rel";
	licenseUrl = walker.relLicense.toString();
	} else if (walker.anchorLicense != null) { // 3rd: anchor w/ CC
	// license
	licenseLocation = "a";
	licenseUrl = walker.anchorLicense.toString();
	} else if (conf.getBoolean("creativecommons.exclude.unlicensed", false)) {
	throw new ParseException("No CC license. Excluding.");
	}

	// add license to metadata
	if (licenseUrl != null) {
	if (LOG.isDebugEnabled()) {
	LOG.debug("CC: found " + licenseUrl + " in " + licenseLocation
	+ " of " + base);
	}
	page.getMetadata().put(new Utf8(CreativeCommons.LICENSE_URL),
	ByteBuffer.wrap(licenseUrl.getBytes()));
	page.getMetadata().put(new Utf8(CreativeCommons.LICENSE_LOCATION),
	ByteBuffer.wrap(licenseLocation.getBytes()));
	}

	if (walker.workType != null) {
	if (LOG.isDebugEnabled()) {
	LOG.debug("CC: found " + walker.workType + " in " + base);
	}
	page.getMetadata().put(new Utf8(CreativeCommons.WORK_TYPE),
	ByteBuffer.wrap(walker.workType.getBytes()));
	}

	}

	/** Scan the document looking for RDF in comments and license elements. */
	private void walk(Node node) {
	// check element nodes for license URL
	if (node instanceof Element) {
	findLicenseUrl((Element) node);
	}

	// check comment nodes for license RDF
	if (node instanceof Comment) {
	findRdf(((Comment) node).getData());
	}

	// recursively walk child nodes
	NodeList children = node.getChildNodes();
	for (int i = 0; children != null && i < children.getLength(); i++) {
	walk(children.item(i));
	}
	}

	/**
	* Extract license url from element, if any. Thse are the href attribute of
	* anchor elements with rel="license". These must also point to
	* http://creativecommons.org/licenses/.
	*/
	private void findLicenseUrl(Element element) {
	// only look in Anchor elements
	if (!"a".equalsIgnoreCase(element.getTagName()))
	return;

	// require an href
	String href = element.getAttribute("href");
	if (href == null)
	return;
	try {
	URL url = new URL(base, href); // resolve the url
	// check that it's a CC license URL
	if ("http".equalsIgnoreCase(url.getProtocol())
	&& "creativecommons.org".equalsIgnoreCase(url.getHost())
	&& url.getPath() != null && url.getPath().startsWith("/licenses/")
	&& url.getPath().length() > "/licenses/".length()) {

	// check rel="license"
	String rel = element.getAttribute("rel");
	if (rel != null && "license".equals(rel) && this.relLicense == null) {
	this.relLicense = url; // found rel license
	} else if (this.anchorLicense == null) {
	this.anchorLicense = url; // found anchor license
	}
	}
	} catch (MalformedURLException e) { // ignore malformed urls
	}
	}

	/** Configure a namespace aware XML parser. */
	private static final DocumentBuilderFactory FACTORY = DocumentBuilderFactory
	.newInstance();

	static {
	FACTORY.setNamespaceAware(true);
	}

	/** Creative Commons' namespace URI. */
	private static final String CC_NS = "http://web.resource.org/cc/";

	/** Dublin Core namespace URI. */
	private static final String DC_NS = "http://purl.org/dc/elements/1.1/";

	/** RDF syntax namespace URI. */
	private static final String RDF_NS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";

	private void findRdf(String comment) {
	// first check for likely RDF in comment
	int rdfPosition = comment.indexOf("RDF");
	if (rdfPosition < 0)
	return; // no RDF, abort
	int nsPosition = comment.indexOf(CC_NS);
	if (nsPosition < 0)
	return; // no RDF, abort
	// try to parse the XML
	Document doc;
	try {
	DocumentBuilder parser = FACTORY.newDocumentBuilder();
	doc = parser.parse(new InputSource(new StringReader(comment)));
	} catch (Exception e) {
	if (LOG.isWarnEnabled()) {
	LOG.warn("CC: Failed to parse RDF in " + base + ": " + e);
	}
	// e.printStackTrace();
	return;
	}

	// check that root is rdf:RDF
	NodeList roots = doc.getElementsByTagNameNS(RDF_NS, "RDF");
	if (roots.getLength() != 1) {
	if (LOG.isWarnEnabled()) {
	LOG.warn("CC: No RDF root in " + base);
	}
	return;
	}
	Element rdf = (Element) roots.item(0);

	// get cc:License nodes inside rdf:RDF
	NodeList licenses = rdf.getElementsByTagNameNS(CC_NS, "License");
	for (int i = 0; i < licenses.getLength(); i++) {
	Element l = (Element) licenses.item(i);
	// license is rdf:about= attribute from cc:License
	this.rdfLicense = l.getAttributeNodeNS(RDF_NS, "about").getValue();

	// walk predicates of cc:License
	NodeList predicates = l.getChildNodes();
	for (int j = 0; j < predicates.getLength(); j++) {
	Node predicateNode = predicates.item(j);
	if (!(predicateNode instanceof Element))
	continue;
	Element predicateElement = (Element) predicateNode;
	// extract predicates of cc:xxx predicates
	if (!CC_NS.equals(predicateElement.getNamespaceURI())) {
	continue;
	}
	String predicate = predicateElement.getLocalName();
	// object is rdf:resource from cc:xxx predicates
	String object = predicateElement.getAttributeNodeNS(RDF_NS,
	"resource").getValue();
	// add object and predicate to metadata
	// metadata.put(object, predicate);
	// if (LOG.isInfoEnabled()) {
	// LOG.info("CC: found: "+predicate+"="+object);
	// }
	}
	}

	// get cc:Work nodes from rdf:RDF
	NodeList works = rdf.getElementsByTagNameNS(CC_NS, "Work");
	for (int i = 0; i < works.getLength(); i++) {
	Element l = (Element) works.item(i);

	// get dc:type nodes from cc:Work
	NodeList types = rdf.getElementsByTagNameNS(DC_NS, "type");
	for (int j = 0; j < types.getLength(); j++) {
	Element type = (Element) types.item(j);
	String workUri = type.getAttributeNodeNS(RDF_NS, "resource")
	.getValue();
	this.workType = (String) WORK_TYPE_NAMES.get(workUri);
	break;
	}
	}
	}
	}

	private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();

	static {
	FIELDS.add(WebPage.Field.BASE_URL);
	FIELDS.add(WebPage.Field.METADATA);
	}

	private static final HashMap<String, String> WORK_TYPE_NAMES = new HashMap<String, String>();

	static {
	WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/MovingImage", "video");
	WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/StillImage", "image");
	WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Sound", "audio");
	WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Text", "text");
	WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Interactive",
	"interactive");
	WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Software", "software");
	WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Image", "image");
	}

	private Configuration conf;

	public void setConf(Configuration conf) {
	this.conf = conf;
	}

	public Configuration getConf() {
	return this.conf;
	}

	@Override
	public Collection<Field> getFields() {
	return FIELDS;
	}

	/**
	* Adds metadata or otherwise modifies a parse of an HTML document, given the
	* DOM tree of a page.
	*/
	@Override
	public Parse filter(String url, WebPage page, Parse parse,
	HTMLMetaTags metaTags, DocumentFragment doc) {
	// construct base url
	URL base;
	try {
	base = new URL(page.getBaseUrl().toString());
	// extract license metadata
	Walker.walk(doc, base, page, getConf());
	} catch (Exception e) {
	LOG.error("Error parsing " + url, e);
	return ParseStatusUtils.getEmptyParse(e, getConf());
	}

	return parse;
	}
	}