core/src/main/java/org/apache/any23/extractor/html/MicroformatExtractor.java - any23 - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *  http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.any23.extractor.html;

 import org.apache.any23.extractor.ExtractionContext;
 import org.apache.any23.extractor.ExtractionException;
 import org.apache.any23.extractor.ExtractionParameters;
 import org.apache.any23.extractor.ExtractionResult;
 import org.apache.any23.extractor.ExtractorDescription;
 import org.apache.any23.extractor.IssueReport;
 import org.apache.any23.extractor.TagSoupExtractionResult;
 import org.apache.any23.extractor.html.annotations.Includes;
 import org.apache.any23.rdf.Any23ValueFactoryWrapper;
 import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
 import org.eclipse.rdf4j.model.BNode;
 import org.eclipse.rdf4j.model.Literal;
 import org.eclipse.rdf4j.model.Resource;
 import org.eclipse.rdf4j.model.IRI;
 import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
 import org.w3c.dom.Document;
 import org.w3c.dom.Node;

 import java.io.IOException;
 import java.util.Locale;

 /**
  * The abstract base class for any <a href="http://microformats.org/">Microformat specification</a> extractor.
  * <p>
  * {@code run(...)} stores the wrapped document, the extraction context and the output result in instance fields
  * and then calls {@link #extract()}; the protected helpers below write triples to that current result. The
  * per-run state lives in plain, unsynchronized fields.
  */
 public abstract class MicroformatExtractor implements TagSoupDOMExtractor {

     /** Opening tag of an attribute-less script element. */
     public static final String BEGIN_SCRIPT = "<script>";

     /** Closing tag of a script element. */
     public static final String END_SCRIPT = "</script>";

     /** Lower-case start of any opening script tag, with or without attributes. */
     private static final String SCRIPT_TAG_PREFIX = "<script";

     /** Wrapper around the DOM document handed to the last {@code run(...)} call. */
     private HTMLDocument htmlDocument;

     /** Extraction context handed to the last {@code run(...)} call. */
     private ExtractionContext context;

     /** Document IRI read from the extraction context in {@code run(...)}. */
     private IRI documentIRI;

     /** Result that currently receives the triples and issue notifications written by the helpers. */
     private ExtractionResult out;

     /** Value factory used by the helpers; {@code run(...)} binds its issue report to the current result. */
     protected final Any23ValueFactoryWrapper valueFactory = new Any23ValueFactoryWrapper(
             SimpleValueFactory.getInstance());

     /**
      * Returns the description of this extractor.
      *
      * @return a human readable description.
      */
     public abstract ExtractorDescription getDescription();

     /**
      * Performs the extraction of the data and writes them to the model. The nodes generated in the model can have any
      * name or implicit label but if possible they <i>SHOULD</i> have names (either URIs or AnonId) that are uniquely
      * derivable from their position in the DOM tree, so that multiple extractors can merge information.
      *
      * @return true if extraction is successful
      *
      * @throws ExtractionException
      *             if there is an error during extraction
      */
     protected abstract boolean extract() throws ExtractionException;

     /**
      * Returns the document wrapper created by the last {@code run(...)} call.
      *
      * @return the wrapped HTML document, or <code>null</code> if {@code run(...)} has not been called yet.
      */
     public HTMLDocument getHTMLDocument() {
         return htmlDocument;
     }

     /**
      * Returns the extraction context passed to the last {@code run(...)} call.
      *
      * @return the extraction context, or <code>null</code> if {@code run(...)} has not been called yet.
      */
     public ExtractionContext getExtractionContext() {
         return context;
     }

     /**
      * Returns the document IRI taken from the extraction context of the last {@code run(...)} call.
      *
      * @return the document IRI, or <code>null</code> if {@code run(...)} has not been called yet.
      */
     public IRI getDocumentIRI() {
         return documentIRI;
     }

     /**
      * Runs this extractor on a DOM document.
      * <p>
      * Stores the document (wrapped in an {@link HTMLDocument}), the context, the document IRI and the result, routes
      * the value factory issue reports to <code>out</code>, invokes {@link #extract()} and finally detaches the issue
      * report again, even when the extraction throws. The boolean returned by {@link #extract()} is not used here.
      *
      * @param extractionParameters
      *            the extraction parameters; not read by this method.
      * @param extractionContext
      *            the context of this extraction; also provides the document IRI.
      * @param in
      *            the DOM document to extract from.
      * @param out
      *            the result that receives triples and issue notifications.
      *
      * @throws IOException
      *             declared by the signature; not thrown directly by the code in this method.
      * @throws ExtractionException
      *             if {@link #extract()} fails.
      */
     public final void run(ExtractionParameters extractionParameters, ExtractionContext extractionContext, Document in,
             ExtractionResult out) throws IOException, ExtractionException {
         this.htmlDocument = new HTMLDocument(in);
         this.context = extractionContext;
         this.documentIRI = extractionContext.getDocumentIRI();
         this.out = out;
         valueFactory.setIssueReport(out);
         try {
             extract();
         } finally {
             // always unbind, so the factory never keeps reporting to the result of a finished run
             valueFactory.setIssueReport(null);
         }
     }

     /**
      * Returns the {@link org.apache.any23.extractor.ExtractionResult} associated to the extraction session.
      *
      * @return the result currently written to by the helper methods.
      */
     protected ExtractionResult getCurrentExtractionResult() {
         return out;
     }

     /**
      * Replaces the result that the helper methods write to.
      *
      * @param out
      *            the new current extraction result.
      */
     protected void setCurrentExtractionResult(ExtractionResult out) {
         this.out = out;
     }

     /**
      * Opens a sub result on the current extraction result. The current result itself is not replaced.
      *
      * @param context
      *            the extraction context of the sub result.
      *
      * @return the sub result returned by the current extraction result.
      */
     protected ExtractionResult openSubResult(ExtractionContext context) {
         return out.openSubResult(context);
     }

     /**
      * Helper method that adds a literal property to a subject only if the value of the property is a valid string,
      * i.e. it is not <code>null</code> and not empty after trimming.
      *
      * @param n
      *            the <i>HTML</i> node from which the property value has been extracted.
      * @param subject
      *            the property subject.
      * @param p
      *            the property IRI.
      * @param value
      *            the property value.
      *
      * @return returns <code>true</code> if the value has been accepted and added, <code>false</code> otherwise.
      */
     protected boolean conditionallyAddStringProperty(Node n, Resource subject, IRI p, String value) {
         if (value == null) {
             return false;
         }
         final String trimmed = value.trim();
         if (trimmed.isEmpty()) {
             return false;
         }
         return conditionallyAddLiteralProperty(n, subject, p, valueFactory.createLiteral(trimmed));
     }

     /**
      * Helper method that adds a literal property to a node. A <code>null</code> literal is rejected, and a literal
      * whose text contains a script block is rejected with a warning on the current result.
      *
      * @param n
      *            the <i>HTML</i> node from which the property value has been extracted.
      * @param subject
      *            the property subject.
      * @param property
      *            the property IRI.
      * @param literal
      *            the property value.
      *
      * @return returns <code>true</code> if the literal has been accepted and added, <code>false</code> otherwise.
      */
     protected boolean conditionallyAddLiteralProperty(Node n, Resource subject, IRI property, Literal literal) {
         if (literal == null) {
             return false;
         }
         final String literalStr = literal.stringValue();
         if (containsScriptBlock(literalStr)) {
             out.notifyIssue(IssueReport.IssueLevel.WARNING,
                     String.format(Locale.ROOT, "Detected script in literal: [%s]", literalStr), -1, -1);
             return false;
         }
         out.writeTriple(subject, property, literal);
         recordPropertyPath(n, subject, property, null);
         return true;
     }

     /**
      * Helper method that adds an IRI property to a node.
      *
      * @param subject
      *            the property subject.
      * @param property
      *            the property IRI.
      * @param uri
      *            the property object.
      *
      * @return <code>true</code> if the resource has been added, <code>false</code> otherwise.
      */
     protected boolean conditionallyAddResourceProperty(Resource subject, IRI property, IRI uri) {
         if (uri == null) {
             return false;
         }
         out.writeTriple(subject, property, uri);
         return true;
     }

     /**
      * Helper method that adds a BNode property to a node and records the DOM path of the originating node.
      *
      * @param n
      *            the <i>HTML</i> node used for extracting such property.
      * @param subject
      *            the property subject.
      * @param property
      *            the property IRI.
      * @param bnode
      *            the property value.
      */
     protected void addBNodeProperty(Node n, Resource subject, IRI property, BNode bnode) {
         out.writeTriple(subject, property, bnode);
         recordPropertyPath(n, subject, property, bnode);
     }

     /**
      * Helper method that adds a BNode property to a node.
      *
      * @param subject
      *            the property subject.
      * @param property
      *            the property IRI.
      * @param bnode
      *            the property value.
      */
     protected void addBNodeProperty(Resource subject, IRI property, BNode bnode) {
         out.writeTriple(subject, property, bnode);
     }

     /**
      * Helper method that adds an IRI property to a node.
      *
      * @param subject
      *            subject to add
      * @param property
      *            predicate to add
      * @param object
      *            object to add
      */
     protected void addIRIProperty(Resource subject, IRI property, IRI object) {
         out.writeTriple(subject, property, object);
     }

     /**
      * Fixes a link through the value factory, without a default schema.
      *
      * @param link
      *            the link to fix.
      *
      * @return the IRI produced by the value factory.
      */
     protected IRI fixLink(String link) {
         return valueFactory.fixLink(link, null);
     }

     /**
      * Fixes a link through the value factory.
      *
      * @param link
      *            the link to fix.
      * @param defaultSchema
      *            the default schema handed to the value factory.
      *
      * @return the IRI produced by the value factory.
      */
     protected IRI fixLink(String link, String defaultSchema) {
         return valueFactory.fixLink(link, defaultSchema);
     }

     /**
      * Records the DOM path of a written property, but only when the current result is a
      * {@link TagSoupExtractionResult}. Guarding the downcast keeps an already written triple from being followed by
      * a <code>ClassCastException</code> when another kind of result is in use.
      *
      * @param n
      *            the <i>HTML</i> node the property was extracted from.
      * @param subject
      *            the property subject.
      * @param property
      *            the property IRI.
      * @param object
      *            the BNode object of the property, or <code>null</code> for literal properties.
      */
     private void recordPropertyPath(Node n, Resource subject, IRI property, BNode object) {
         if (out instanceof TagSoupExtractionResult) {
             ((TagSoupExtractionResult) out).addPropertyPath(this.getClass(), subject, property, object,
                     DomUtils.getXPathListForNode(n));
         }
     }

     /**
      * Checks, case-insensitively, whether a string contains an opening script tag followed by {@link #END_SCRIPT}.
      * Opening tags that carry attributes (e.g. <code>&lt;script type="text/javascript"&gt;</code>) are detected as
      * well as the bare {@link #BEGIN_SCRIPT}; longer tag names that merely start with "script" are ignored.
      *
      * @param in
      *            the text to inspect.
      *
      * @return <code>true</code> if a script block has been found.
      */
     private boolean containsScriptBlock(String in) {
         final String lower = in.toLowerCase(Locale.ROOT);
         final int prefixLength = SCRIPT_TAG_PREFIX.length();
         for (int open = lower.indexOf(SCRIPT_TAG_PREFIX); open != -1; open = lower.indexOf(SCRIPT_TAG_PREFIX,
                 open + prefixLength)) {
             final int afterName = open + prefixLength;
             if (afterName < lower.length() && endsTagName(lower.charAt(afterName))) {
                 // The first real opening tag decides: a closing tag after any later opening tag would also
                 // lie after this one.
                 return lower.indexOf(END_SCRIPT, afterName) != -1;
             }
         }
         return false;
     }

     /**
      * Tells whether a character terminates an HTML tag name.
      *
      * @param c
      *            the character following the candidate tag name.
      *
      * @return <code>true</code> for <code>&gt;</code>, <code>/</code> and whitespace.
      */
     private static boolean endsTagName(char c) {
         return c == '>' || c == '/' || Character.isWhitespace(c);
     }

     /**
      * This method checks if there is a native nesting relationship between two {@link MicroformatExtractor}.
      *
      * @see org.apache.any23.extractor.html.annotations.Includes
      *
      * @param including
      *            the including {@link MicroformatExtractor}
      * @param included
      *            the included {@link MicroformatExtractor}
      *
      * @return <code>true</code> if there is a declared nesting relationship
      */
     public static boolean includes(Class<? extends MicroformatExtractor> including,
             Class<? extends MicroformatExtractor> included) {
         final Includes includes = including.getAnnotation(Includes.class);
         if (includes == null) {
             return false;
         }
         // annotation element values are never null, and an empty array simply skips the loop
         for (Class<? extends MicroformatExtractor> extractor : includes.extractors()) {
             if (extractor.equals(included)) {
                 return true;
             }
         }
         return false;
     }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.any23.extractor.html;

	import org.apache.any23.extractor.ExtractionContext;
	import org.apache.any23.extractor.ExtractionException;
	import org.apache.any23.extractor.ExtractionParameters;
	import org.apache.any23.extractor.ExtractionResult;
	import org.apache.any23.extractor.ExtractorDescription;
	import org.apache.any23.extractor.IssueReport;
	import org.apache.any23.extractor.TagSoupExtractionResult;
	import org.apache.any23.extractor.html.annotations.Includes;
	import org.apache.any23.rdf.Any23ValueFactoryWrapper;
	import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
	import org.eclipse.rdf4j.model.BNode;
	import org.eclipse.rdf4j.model.Literal;
	import org.eclipse.rdf4j.model.Resource;
	import org.eclipse.rdf4j.model.IRI;
	import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
	import org.w3c.dom.Document;
	import org.w3c.dom.Node;

	import java.io.IOException;
	import java.util.Locale;

	/**
	 * The abstract base class for any <a href="http://microformats.org/">Microformat specification</a> extractor.
	 * <p>
	 * {@code run(...)} stores the wrapped document, the extraction context and the output result in instance fields
	 * and then calls {@link #extract()}; the protected helpers below write triples to that current result. The
	 * per-run state lives in plain, unsynchronized fields.
	 */
	public abstract class MicroformatExtractor implements TagSoupDOMExtractor {

	    /** Opening tag of an attribute-less script element. */
	    public static final String BEGIN_SCRIPT = "<script>";

	    /** Closing tag of a script element. */
	    public static final String END_SCRIPT = "</script>";

	    /** Lower-case start of any opening script tag, with or without attributes. */
	    private static final String SCRIPT_TAG_PREFIX = "<script";

	    /** Wrapper around the DOM document handed to the last {@code run(...)} call. */
	    private HTMLDocument htmlDocument;

	    /** Extraction context handed to the last {@code run(...)} call. */
	    private ExtractionContext context;

	    /** Document IRI read from the extraction context in {@code run(...)}. */
	    private IRI documentIRI;

	    /** Result that currently receives the triples and issue notifications written by the helpers. */
	    private ExtractionResult out;

	    /** Value factory used by the helpers; {@code run(...)} binds its issue report to the current result. */
	    protected final Any23ValueFactoryWrapper valueFactory = new Any23ValueFactoryWrapper(
	            SimpleValueFactory.getInstance());

	    /**
	     * Returns the description of this extractor.
	     *
	     * @return a human readable description.
	     */
	    public abstract ExtractorDescription getDescription();

	    /**
	     * Performs the extraction of the data and writes them to the model. The nodes generated in the model can have
	     * any name or implicit label but if possible they <i>SHOULD</i> have names (either URIs or AnonId) that are
	     * uniquely derivable from their position in the DOM tree, so that multiple extractors can merge information.
	     *
	     * @return true if extraction is successful
	     *
	     * @throws ExtractionException
	     *             if there is an error during extraction
	     */
	    protected abstract boolean extract() throws ExtractionException;

	    /**
	     * Returns the document wrapper created by the last {@code run(...)} call.
	     *
	     * @return the wrapped HTML document, or <code>null</code> if {@code run(...)} has not been called yet.
	     */
	    public HTMLDocument getHTMLDocument() {
	        return htmlDocument;
	    }

	    /**
	     * Returns the extraction context passed to the last {@code run(...)} call.
	     *
	     * @return the extraction context, or <code>null</code> if {@code run(...)} has not been called yet.
	     */
	    public ExtractionContext getExtractionContext() {
	        return context;
	    }

	    /**
	     * Returns the document IRI taken from the extraction context of the last {@code run(...)} call.
	     *
	     * @return the document IRI, or <code>null</code> if {@code run(...)} has not been called yet.
	     */
	    public IRI getDocumentIRI() {
	        return documentIRI;
	    }

	    /**
	     * Runs this extractor on a DOM document.
	     * <p>
	     * Stores the document (wrapped in an {@link HTMLDocument}), the context, the document IRI and the result,
	     * routes the value factory issue reports to <code>out</code>, invokes {@link #extract()} and finally detaches
	     * the issue report again, even when the extraction throws. The boolean returned by {@link #extract()} is not
	     * used here.
	     *
	     * @param extractionParameters
	     *            the extraction parameters; not read by this method.
	     * @param extractionContext
	     *            the context of this extraction; also provides the document IRI.
	     * @param in
	     *            the DOM document to extract from.
	     * @param out
	     *            the result that receives triples and issue notifications.
	     *
	     * @throws IOException
	     *             declared by the signature; not thrown directly by the code in this method.
	     * @throws ExtractionException
	     *             if {@link #extract()} fails.
	     */
	    public final void run(ExtractionParameters extractionParameters, ExtractionContext extractionContext,
	            Document in, ExtractionResult out) throws IOException, ExtractionException {
	        this.htmlDocument = new HTMLDocument(in);
	        this.context = extractionContext;
	        this.documentIRI = extractionContext.getDocumentIRI();
	        this.out = out;
	        valueFactory.setIssueReport(out);
	        try {
	            extract();
	        } finally {
	            // always unbind, so the factory never keeps reporting to the result of a finished run
	            valueFactory.setIssueReport(null);
	        }
	    }

	    /**
	     * Returns the {@link org.apache.any23.extractor.ExtractionResult} associated to the extraction session.
	     *
	     * @return the result currently written to by the helper methods.
	     */
	    protected ExtractionResult getCurrentExtractionResult() {
	        return out;
	    }

	    /**
	     * Replaces the result that the helper methods write to.
	     *
	     * @param out
	     *            the new current extraction result.
	     */
	    protected void setCurrentExtractionResult(ExtractionResult out) {
	        this.out = out;
	    }

	    /**
	     * Opens a sub result on the current extraction result. The current result itself is not replaced.
	     *
	     * @param context
	     *            the extraction context of the sub result.
	     *
	     * @return the sub result returned by the current extraction result.
	     */
	    protected ExtractionResult openSubResult(ExtractionContext context) {
	        return out.openSubResult(context);
	    }

	    /**
	     * Helper method that adds a literal property to a subject only if the value of the property is a valid string,
	     * i.e. it is not <code>null</code> and not empty after trimming.
	     *
	     * @param n
	     *            the <i>HTML</i> node from which the property value has been extracted.
	     * @param subject
	     *            the property subject.
	     * @param p
	     *            the property IRI.
	     * @param value
	     *            the property value.
	     *
	     * @return returns <code>true</code> if the value has been accepted and added, <code>false</code> otherwise.
	     */
	    protected boolean conditionallyAddStringProperty(Node n, Resource subject, IRI p, String value) {
	        if (value == null) {
	            return false;
	        }
	        final String trimmed = value.trim();
	        if (trimmed.isEmpty()) {
	            return false;
	        }
	        return conditionallyAddLiteralProperty(n, subject, p, valueFactory.createLiteral(trimmed));
	    }

	    /**
	     * Helper method that adds a literal property to a node. A <code>null</code> literal is rejected, and a literal
	     * whose text contains a script block is rejected with a warning on the current result.
	     *
	     * @param n
	     *            the <i>HTML</i> node from which the property value has been extracted.
	     * @param subject
	     *            the property subject.
	     * @param property
	     *            the property IRI.
	     * @param literal
	     *            the property value.
	     *
	     * @return returns <code>true</code> if the literal has been accepted and added, <code>false</code> otherwise.
	     */
	    protected boolean conditionallyAddLiteralProperty(Node n, Resource subject, IRI property, Literal literal) {
	        if (literal == null) {
	            return false;
	        }
	        final String literalStr = literal.stringValue();
	        if (containsScriptBlock(literalStr)) {
	            out.notifyIssue(IssueReport.IssueLevel.WARNING,
	                    String.format(Locale.ROOT, "Detected script in literal: [%s]", literalStr), -1, -1);
	            return false;
	        }
	        out.writeTriple(subject, property, literal);
	        recordPropertyPath(n, subject, property, null);
	        return true;
	    }

	    /**
	     * Helper method that adds an IRI property to a node.
	     *
	     * @param subject
	     *            the property subject.
	     * @param property
	     *            the property IRI.
	     * @param uri
	     *            the property object.
	     *
	     * @return <code>true</code> if the resource has been added, <code>false</code> otherwise.
	     */
	    protected boolean conditionallyAddResourceProperty(Resource subject, IRI property, IRI uri) {
	        if (uri == null) {
	            return false;
	        }
	        out.writeTriple(subject, property, uri);
	        return true;
	    }

	    /**
	     * Helper method that adds a BNode property to a node and records the DOM path of the originating node.
	     *
	     * @param n
	     *            the <i>HTML</i> node used for extracting such property.
	     * @param subject
	     *            the property subject.
	     * @param property
	     *            the property IRI.
	     * @param bnode
	     *            the property value.
	     */
	    protected void addBNodeProperty(Node n, Resource subject, IRI property, BNode bnode) {
	        out.writeTriple(subject, property, bnode);
	        recordPropertyPath(n, subject, property, bnode);
	    }

	    /**
	     * Helper method that adds a BNode property to a node.
	     *
	     * @param subject
	     *            the property subject.
	     * @param property
	     *            the property IRI.
	     * @param bnode
	     *            the property value.
	     */
	    protected void addBNodeProperty(Resource subject, IRI property, BNode bnode) {
	        out.writeTriple(subject, property, bnode);
	    }

	    /**
	     * Helper method that adds an IRI property to a node.
	     *
	     * @param subject
	     *            subject to add
	     * @param property
	     *            predicate to add
	     * @param object
	     *            object to add
	     */
	    protected void addIRIProperty(Resource subject, IRI property, IRI object) {
	        out.writeTriple(subject, property, object);
	    }

	    /**
	     * Fixes a link through the value factory, without a default schema.
	     *
	     * @param link
	     *            the link to fix.
	     *
	     * @return the IRI produced by the value factory.
	     */
	    protected IRI fixLink(String link) {
	        return valueFactory.fixLink(link, null);
	    }

	    /**
	     * Fixes a link through the value factory.
	     *
	     * @param link
	     *            the link to fix.
	     * @param defaultSchema
	     *            the default schema handed to the value factory.
	     *
	     * @return the IRI produced by the value factory.
	     */
	    protected IRI fixLink(String link, String defaultSchema) {
	        return valueFactory.fixLink(link, defaultSchema);
	    }

	    /**
	     * Records the DOM path of a written property, but only when the current result is a
	     * {@link TagSoupExtractionResult}. Guarding the downcast keeps an already written triple from being followed
	     * by a <code>ClassCastException</code> when another kind of result is in use.
	     *
	     * @param n
	     *            the <i>HTML</i> node the property was extracted from.
	     * @param subject
	     *            the property subject.
	     * @param property
	     *            the property IRI.
	     * @param object
	     *            the BNode object of the property, or <code>null</code> for literal properties.
	     */
	    private void recordPropertyPath(Node n, Resource subject, IRI property, BNode object) {
	        if (out instanceof TagSoupExtractionResult) {
	            ((TagSoupExtractionResult) out).addPropertyPath(this.getClass(), subject, property, object,
	                    DomUtils.getXPathListForNode(n));
	        }
	    }

	    /**
	     * Checks, case-insensitively, whether a string contains an opening script tag followed by {@link #END_SCRIPT}.
	     * Opening tags that carry attributes (e.g. <code>&lt;script type="text/javascript"&gt;</code>) are detected
	     * as well as the bare {@link #BEGIN_SCRIPT}; longer tag names that merely start with "script" are ignored.
	     *
	     * @param in
	     *            the text to inspect.
	     *
	     * @return <code>true</code> if a script block has been found.
	     */
	    private boolean containsScriptBlock(String in) {
	        final String lower = in.toLowerCase(Locale.ROOT);
	        final int prefixLength = SCRIPT_TAG_PREFIX.length();
	        for (int open = lower.indexOf(SCRIPT_TAG_PREFIX); open != -1; open = lower.indexOf(SCRIPT_TAG_PREFIX,
	                open + prefixLength)) {
	            final int afterName = open + prefixLength;
	            if (afterName < lower.length() && endsTagName(lower.charAt(afterName))) {
	                // The first real opening tag decides: a closing tag after any later opening tag would also
	                // lie after this one.
	                return lower.indexOf(END_SCRIPT, afterName) != -1;
	            }
	        }
	        return false;
	    }

	    /**
	     * Tells whether a character terminates an HTML tag name.
	     *
	     * @param c
	     *            the character following the candidate tag name.
	     *
	     * @return <code>true</code> for <code>&gt;</code>, <code>/</code> and whitespace.
	     */
	    private static boolean endsTagName(char c) {
	        return c == '>' || c == '/' || Character.isWhitespace(c);
	    }

	    /**
	     * This method checks if there is a native nesting relationship between two {@link MicroformatExtractor}.
	     *
	     * @see org.apache.any23.extractor.html.annotations.Includes
	     *
	     * @param including
	     *            the including {@link MicroformatExtractor}
	     * @param included
	     *            the included {@link MicroformatExtractor}
	     *
	     * @return <code>true</code> if there is a declared nesting relationship
	     */
	    public static boolean includes(Class<? extends MicroformatExtractor> including,
	            Class<? extends MicroformatExtractor> included) {
	        final Includes includes = including.getAnnotation(Includes.class);
	        if (includes == null) {
	            return false;
	        }
	        // annotation element values are never null, and an empty array simply skips the loop
	        for (Class<? extends MicroformatExtractor> extractor : includes.extractors()) {
	            if (extractor.equals(included)) {
	                return true;
	            }
	        }
	        return false;
	    }

	}