| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.creativecommons.nutch; |
| |
| import org.apache.nutch.metadata.CreativeCommons; |
| import org.apache.nutch.protocol.Content; |
| import org.apache.nutch.metadata.Metadata; |
| import org.apache.nutch.parse.HTMLMetaTags; |
| import org.apache.nutch.parse.HtmlParseFilter; |
| import org.apache.nutch.parse.Parse; |
| import org.apache.nutch.parse.ParseException; |
| import org.apache.nutch.parse.ParseResult; |
| import org.apache.nutch.parse.ParseStatus; |
| import org.apache.nutch.parse.ParseText; |
| import org.apache.hadoop.conf.Configuration; |
| |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| import org.w3c.dom.Comment; |
| import org.w3c.dom.Document; |
| import org.w3c.dom.DocumentFragment; |
| import org.w3c.dom.Element; |
| import org.w3c.dom.Node; |
| import org.w3c.dom.NodeList; |
| |
| import java.io.StringReader; |
| import java.lang.invoke.MethodHandles; |
| import java.net.MalformedURLException; |
| import java.net.URL; |
| import java.util.HashMap; |
| |
| import javax.xml.parsers.DocumentBuilder; |
| import javax.xml.parsers.DocumentBuilderFactory; |
| |
| import org.xml.sax.InputSource; |
| |
| /** Adds metadata identifying the Creative Commons license used, if any. */ |
| public class CCParseFilter implements HtmlParseFilter { |
| private static final Logger LOG = LoggerFactory |
| .getLogger(MethodHandles.lookup().lookupClass()); |
| |
| /** Walks DOM tree, looking for RDF in comments and licenses in anchors. */ |
| public static class Walker { |
| private URL base; // base url of page |
| private String rdfLicense; // subject url found, if any |
| private URL relLicense; // license url found, if any |
| private URL anchorLicense; // anchor url found, if any |
| private String workType; // work type URI |
| |
| private Walker(URL base) { |
| this.base = base; |
| } |
| |
| /** |
| * Scan the document adding attributes to metadata. |
| * @param doc the {@link org.w3c.dom.Node} to walk and process |
| * @param base canonical url |
| * @param metadata url {@link org.apache.nutch.metadata.Metadata} |
| * @param conf a populated {@link org.apache.hadoop.conf.Configuration} |
| * @throws ParseException if there is a fatal error or if |
| * <code>creativecommons.exclude.unlicensed</code> is set to true |
| */ |
| public static void walk(Node doc, URL base, Metadata metadata, |
| Configuration conf) throws ParseException { |
| |
| // walk the DOM tree, scanning for license data |
| Walker walker = new Walker(base); |
| walker.walk(doc); |
| |
| // interpret results of walk |
| String licenseUrl = null; |
| String licenseLocation = null; |
| if (walker.rdfLicense != null) { // 1st choice: subject in RDF |
| licenseLocation = "rdf"; |
| licenseUrl = walker.rdfLicense; |
| } else if (walker.relLicense != null) { // 2nd: anchor w/ rel=license |
| licenseLocation = "rel"; |
| licenseUrl = walker.relLicense.toString(); |
| } else if (walker.anchorLicense != null) { // 3rd: anchor w/ CC license |
| licenseLocation = "a"; |
| licenseUrl = walker.anchorLicense.toString(); |
| } else if (conf.getBoolean("creativecommons.exclude.unlicensed", false)) { |
| throw new ParseException("No CC license. Excluding."); |
| } |
| |
| // add license to metadata |
| if (licenseUrl != null) { |
| if (LOG.isInfoEnabled()) { |
| LOG.info("CC: found " + licenseUrl + " in " + licenseLocation |
| + " of " + base); |
| } |
| metadata.add(CreativeCommons.LICENSE_URL, licenseUrl); |
| metadata.add(CreativeCommons.LICENSE_LOCATION, licenseLocation); |
| } |
| |
| if (walker.workType != null) { |
| if (LOG.isInfoEnabled()) { |
| LOG.info("CC: found " + walker.workType + " in " + base); |
| } |
| metadata.add(CreativeCommons.WORK_TYPE, walker.workType); |
| } |
| |
| } |
| |
| /** Scan the document looking for RDF in comments and license elements. */ |
| private void walk(Node node) { |
| |
| // check element nodes for license URL |
| if (node instanceof Element) { |
| findLicenseUrl((Element) node); |
| } |
| |
| // check comment nodes for license RDF |
| if (node instanceof Comment) { |
| findRdf(((Comment) node).getData()); |
| } |
| |
| // recursively walk child nodes |
| NodeList children = node.getChildNodes(); |
| for (int i = 0; children != null && i < children.getLength(); i++) { |
| walk(children.item(i)); |
| } |
| } |
| |
| /** |
| * Extract license url from element, if any. Thse are the href attribute of |
| * anchor elements with rel="license". These must also point to |
| * http://creativecommons.org/licenses/. |
| */ |
| private void findLicenseUrl(Element element) { |
| // only look in Anchor elements |
| if (!"a".equalsIgnoreCase(element.getTagName())) |
| return; |
| |
| // require an href |
| String href = element.getAttribute("href"); |
| if (href == null) |
| return; |
| |
| try { |
| URL url = new URL(base, href); // resolve the url |
| |
| // check that it's a CC license URL |
| if ("http".equalsIgnoreCase(url.getProtocol()) |
| && "creativecommons.org".equalsIgnoreCase(url.getHost()) |
| && url.getPath() != null && url.getPath().startsWith("/licenses/") |
| && url.getPath().length() > "/licenses/".length()) { |
| |
| // check rel="license" |
| String rel = element.getAttribute("rel"); |
| if (rel != null && "license".equals(rel) && this.relLicense == null) { |
| this.relLicense = url; // found rel license |
| } else if (this.anchorLicense == null) { |
| this.anchorLicense = url; // found anchor license |
| } |
| } |
| } catch (MalformedURLException e) { // ignore malformed urls |
| } |
| } |
| |
| /** Configure a namespace aware XML parser. */ |
| private static final DocumentBuilderFactory FACTORY = DocumentBuilderFactory |
| .newInstance(); |
| static { |
| FACTORY.setNamespaceAware(true); |
| } |
| |
| /** Creative Commons' namespace URI. */ |
| private static final String CC_NS = "http://web.resource.org/cc/"; |
| |
| /** Dublin Core namespace URI. */ |
| private static final String DC_NS = "http://purl.org/dc/elements/1.1/"; |
| |
| /** RDF syntax namespace URI. */ |
| private static final String RDF_NS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"; |
| |
| private void findRdf(String comment) { |
| // first check for likely RDF in comment |
| int rdfPosition = comment.indexOf("RDF"); |
| if (rdfPosition < 0) |
| return; // no RDF, abort |
| int nsPosition = comment.indexOf(CC_NS); |
| if (nsPosition < 0) |
| return; // no RDF, abort |
| |
| // try to parse the XML |
| Document doc; |
| try { |
| DocumentBuilder parser = FACTORY.newDocumentBuilder(); |
| doc = parser.parse(new InputSource(new StringReader(comment))); |
| } catch (Exception e) { |
| if (LOG.isWarnEnabled()) { |
| LOG.warn("CC: Failed to parse RDF in " + base + ": " + e); |
| } |
| return; |
| } |
| |
| // check that root is rdf:RDF |
| NodeList roots = doc.getElementsByTagNameNS(RDF_NS, "RDF"); |
| if (roots.getLength() != 1) { |
| if (LOG.isWarnEnabled()) { |
| LOG.warn("CC: No RDF root in " + base); |
| } |
| return; |
| } |
| Element rdf = (Element) roots.item(0); |
| |
| // get cc:License nodes inside rdf:RDF |
| NodeList licenses = rdf.getElementsByTagNameNS(CC_NS, "License"); |
| for (int i = 0; i < licenses.getLength(); i++) { |
| |
| Element l = (Element) licenses.item(i); |
| |
| // license is rdf:about= attribute from cc:License |
| this.rdfLicense = l.getAttributeNodeNS(RDF_NS, "about").getValue(); |
| |
| // walk predicates of cc:License |
| NodeList predicates = l.getChildNodes(); |
| for (int j = 0; j < predicates.getLength(); j++) { |
| Node predicateNode = predicates.item(j); |
| if (!(predicateNode instanceof Element)) |
| continue; |
| Element predicateElement = (Element) predicateNode; |
| |
| // extract predicates of cc:xxx predicates |
| if (!CC_NS.equals(predicateElement.getNamespaceURI())) { |
| continue; |
| } |
| } |
| } |
| |
| // get cc:Work nodes from rdf:RDF |
| NodeList works = rdf.getElementsByTagNameNS(CC_NS, "Work"); |
| for (int i = 0; i < works.getLength(); i++) { |
| // get dc:type nodes from cc:Work |
| NodeList types = rdf.getElementsByTagNameNS(DC_NS, "type"); |
| |
| for (int j = 0; j < types.getLength(); j++) { |
| Element type = (Element) types.item(j); |
| String workUri = type.getAttributeNodeNS(RDF_NS, "resource") |
| .getValue(); |
| this.workType = WORK_TYPE_NAMES.get(workUri); |
| } |
| } |
| } |
| } |
| |
| private static final HashMap<String, String> WORK_TYPE_NAMES = new HashMap<>(); |
| static { |
| WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/MovingImage", "video"); |
| WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/StillImage", "image"); |
| WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Sound", "audio"); |
| WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Text", "text"); |
| WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Interactive", |
| "interactive"); |
| WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Software", "software"); |
| WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Image", "image"); |
| } |
| |
| private Configuration conf; |
| |
| /** |
| * Adds metadata or otherwise modifies a parse of an HTML document, given the |
| * DOM tree of a page. |
| */ |
| @Override |
| public ParseResult filter(Content content, ParseResult parseResult, |
| HTMLMetaTags metaTags, DocumentFragment doc) { |
| |
| // get parse obj |
| Parse parse = parseResult.get(content.getUrl()); |
| |
| // construct base url |
| URL base; |
| try { |
| base = new URL(content.getBaseUrl()); |
| } catch (MalformedURLException e) { |
| Parse emptyParse = new ParseStatus(e).getEmptyParse(getConf()); |
| parseResult.put(content.getUrl(), new ParseText(emptyParse.getText()), |
| emptyParse.getData()); |
| return parseResult; |
| } |
| |
| try { |
| // extract license metadata |
| Walker.walk(doc, base, parse.getData().getParseMeta(), getConf()); |
| } catch (ParseException e) { |
| Parse emptyParse = new ParseStatus(e).getEmptyParse(getConf()); |
| parseResult.put(content.getUrl(), new ParseText(emptyParse.getText()), |
| emptyParse.getData()); |
| return parseResult; |
| } |
| |
| return parseResult; |
| } |
| |
| @Override |
| public void setConf(Configuration conf) { |
| this.conf = conf; |
| } |
| |
| @Override |
| public Configuration getConf() { |
| return this.conf; |
| } |
| } |