core/src/main/java/org/apache/any23/extractor/html/MicroformatExtractor.java - any23 - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *  http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.any23.extractor.html;

 import org.apache.any23.extractor.ExtractionContext;
 import org.apache.any23.extractor.ExtractionException;
 import org.apache.any23.extractor.ExtractionParameters;
 import org.apache.any23.extractor.ExtractionResult;
 import org.apache.any23.extractor.ExtractorDescription;
 import org.apache.any23.extractor.IssueReport;
 import org.apache.any23.extractor.TagSoupExtractionResult;
 import org.apache.any23.extractor.html.annotations.Includes;
 import org.apache.any23.rdf.Any23ValueFactoryWrapper;
 import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
 import org.eclipse.rdf4j.model.BNode;
 import org.eclipse.rdf4j.model.Literal;
 import org.eclipse.rdf4j.model.Resource;
 import org.eclipse.rdf4j.model.IRI;
 import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
 import org.w3c.dom.Document;
 import org.w3c.dom.Node;

 import java.io.IOException;
 import java.util.Locale;

 /**
  * The abstract base class for any
  * <a href="http://microformats.org/">Microformat specification</a> extractor.
  *
  * <p>The state of the current extraction (HTML document, context, document IRI and
  * output sink) is stored in instance fields that
  * {@link #run(ExtractionParameters, ExtractionContext, Document, ExtractionResult)}
  * assigns before delegating to {@link #extract()}.</p>
  */
 public abstract class MicroformatExtractor implements TagSoupDOMExtractor {

     /** Opening script tag without attributes, in lower case. */
     public static final String BEGIN_SCRIPT = "<script>";

     /** Closing script tag, in lower case. */
     public static final String END_SCRIPT   = "</script>";

     /** Start of an opening script tag; what follows decides whether it really is one. */
     private static final String SCRIPT_OPEN_PREFIX = "<script";

     private HTMLDocument htmlDocument;

     private ExtractionContext context;

     private IRI documentIRI;

     private ExtractionResult out;

     protected final Any23ValueFactoryWrapper valueFactory =
             new Any23ValueFactoryWrapper(SimpleValueFactory.getInstance());

     /**
      * Returns the description of this extractor.
      *
      * @return a human readable description.
      */
     public abstract ExtractorDescription getDescription();

     /**
      * Performs the extraction of the data and writes them to the model.
      * The nodes generated in the model can have any name or implicit label
      * but if possible they <i>SHOULD</i> have names (either URIs or AnonId) that
      * are uniquely derivable from their position in the DOM tree, so that
      * multiple extractors can merge information.
      * @return true if extraction is successful
      * @throws ExtractionException if there is an error during extraction
      */
     protected abstract boolean extract() throws ExtractionException;

     /**
      * Returns the HTML document wrapper built by the most recent call to <code>run</code>.
      *
      * @return the wrapped document, or <code>null</code> if <code>run</code> was never invoked.
      */
     public HTMLDocument getHTMLDocument() {
         return htmlDocument;
     }

     /**
      * Returns the extraction context received by the most recent call to <code>run</code>.
      *
      * @return the context, or <code>null</code> if <code>run</code> was never invoked.
      */
     public ExtractionContext getExtractionContext() {
         return context;
     }

     /**
      * Returns the document IRI read from the extraction context by the most recent
      * call to <code>run</code>.
      *
      * @return the document IRI, or <code>null</code> if <code>run</code> was never invoked.
      */
     public IRI getDocumentIRI() {
         return documentIRI;
     }

     /**
      * Stores the inputs of this extraction in the instance fields, registers the output
      * as issue report of the value factory, and invokes {@link #extract()}.
      * The issue report is always detached from the value factory afterwards, even when
      * the extraction throws. The boolean returned by {@link #extract()} is not used here.
      *
      * @param extractionParameters the extraction parameters (not read by this method).
      * @param extractionContext the context providing the document IRI.
      * @param in the DOM document to be wrapped in an {@link HTMLDocument}.
      * @param out the sink receiving triples and issues.
      * @throws IOException declared by the signature; not raised directly by this method.
      * @throws ExtractionException if {@link #extract()} fails.
      */
     public final void run(
             ExtractionParameters extractionParameters,
             ExtractionContext extractionContext,
             Document in,
             ExtractionResult out
     ) throws IOException, ExtractionException {
         this.htmlDocument = new HTMLDocument(in);
         this.context      = extractionContext;
         this.documentIRI  = extractionContext.getDocumentIRI();
         this.out          = out;
         valueFactory.setIssueReport(out);
         try {
             extract();
         } finally {
             // Do not let the value factory keep reporting into a finished result.
             valueFactory.setIssueReport(null);
         }
     }

     /**
      * Returns the {@link org.apache.any23.extractor.ExtractionResult} associated
      * to the extraction session.
      *
      * @return the extraction result currently used as output.
      */
     protected ExtractionResult getCurrentExtractionResult() {
         return out;
     }

     /**
      * Replaces the extraction result used as output by all the helper methods of this class.
      *
      * @param out the new output.
      */
     protected void setCurrentExtractionResult(ExtractionResult out) {
         this.out = out;
     }

     /**
      * Opens a sub result on the current extraction result.
      *
      * @param context the context passed to the sub result.
      * @return the sub result returned by the current extraction result.
      */
     protected ExtractionResult openSubResult(ExtractionContext context) {
         return out.openSubResult(context);
     }

     /**
      * Helper method that adds a literal property to a subject only if the value of the property
      * is a valid string, i.e. it is not <code>null</code> and not empty after trimming.
      *
      * @param n the <i>HTML</i> node from which the property value has been extracted.
      * @param subject the property subject.
      * @param p the property IRI.
      * @param value the property value.
      * @return returns <code>true</code> if the value has been accepted and added, <code>false</code> otherwise.
      */
     protected boolean conditionallyAddStringProperty(
             Node n,
             Resource subject, IRI p, String value
     ) {
         if (value == null) {
             return false;
         }
         final String trimmed = value.trim();
         if (trimmed.isEmpty()) {
             return false;
         }
         return conditionallyAddLiteralProperty(n, subject, p, valueFactory.createLiteral(trimmed));
     }

     /**
      * Helper method that adds a literal property to a node. Literals whose text contains
      * a script block are rejected and reported as a warning instead of being written.
      *
      * @param n the <i>HTML</i> node from which the property value has been extracted.
      * @param subject the property subject.
      * @param property the property IRI.
      * @param literal the property value.
      * @return returns <code>true</code> if the literal has been accepted and added, <code>false</code> otherwise.
      */
     protected boolean conditionallyAddLiteralProperty(
             Node n,
             Resource subject,
             IRI property,
             Literal literal
     ) {
         final String literalStr = literal.stringValue();
         if (containsScriptBlock(literalStr)) {
             out.notifyIssue(
                     IssueReport.IssueLevel.WARNING,
                     String.format(Locale.ROOT, "Detected script in literal: [%s]", literalStr),
                     -1,
                     -1
             );
             return false;
         }
         out.writeTriple(subject, property, literal);
         recordPropertyPath(n, subject, property, null);
         return true;
     }

     /**
      * Helper method that adds a IRI property to a node.
      * @param subject the property subject.
      * @param property the property IRI.
      * @param uri the property object.
      * @return <code>true</code> if the resource has been added, <code>false</code> otherwise.
      */
     protected boolean conditionallyAddResourceProperty(Resource subject, IRI property, IRI uri) {
         if (uri == null) {
             return false;
         }
         out.writeTriple(subject, property, uri);
         return true;
     }

     /**
      * Helper method that adds a BNode property to a node and records the path of the
      * originating <i>HTML</i> node.
      *
      * @param n the <i>HTML</i> node used for extracting such property.
      * @param subject the property subject.
      * @param property the property IRI.
      * @param bnode the property value.
      */
     protected void addBNodeProperty(Node n, Resource subject, IRI property, BNode bnode) {
         out.writeTriple(subject, property, bnode);
         recordPropertyPath(n, subject, property, bnode);
     }

     /**
      * Helper method that adds a BNode property to a node without recording any
      * property path.
      *
      * @param subject the property subject.
      * @param property the property IRI.
      * @param bnode the property value.
      */
     protected void addBNodeProperty( Resource subject, IRI property, BNode bnode) {
         out.writeTriple(subject, property, bnode);
     }

     /**
      * Helper method that adds a IRI property to a node.
      *
      * @param subject subject to add
      * @param property predicate to add
      * @param object object to add
      */
     protected void addIRIProperty(Resource subject, IRI property, IRI object) {
         out.writeTriple(subject, property, object);
     }

     /**
      * Fixes a link through the value factory, without any default schema.
      *
      * @param link the link to be fixed.
      * @return the IRI produced by the value factory.
      */
     protected IRI fixLink(String link) {
         return valueFactory.fixLink(link, null);
     }

     /**
      * Fixes a link through the value factory.
      *
      * @param link the link to be fixed.
      * @param defaultSchema the default schema handed to the value factory.
      * @return the IRI produced by the value factory.
      */
     protected IRI fixLink(String link, String defaultSchema) {
         return valueFactory.fixLink(link, defaultSchema);
     }

     /**
      * Registers the XPath list of the given node as property path of the statement,
      * provided that the current output is a {@link TagSoupExtractionResult}.
      * The output can be replaced with any {@link ExtractionResult} through
      * {@link #setCurrentExtractionResult(ExtractionResult)}; in that case the triple has
      * already been written and only the path bookkeeping is skipped, instead of failing
      * with a <code>ClassCastException</code>.
      *
      * @param n the <i>HTML</i> node the statement originates from.
      * @param subject the statement subject.
      * @param property the statement predicate.
      * @param object the blank node object, or <code>null</code> for literal objects.
      */
     private void recordPropertyPath(Node n, Resource subject, IRI property, BNode object) {
         if (out instanceof TagSoupExtractionResult) {
             ((TagSoupExtractionResult) out).addPropertyPath(
                     this.getClass(), subject, property, object, DomUtils.getXPathListForNode(n)
             );
         }
     }

     /**
      * Checks, ignoring case, whether the given text contains an opening script tag
      * followed by a closing script tag. The opening tag may carry attributes
      * (e.g. <code>&lt;script type="text/javascript"&gt;</code>), so the check is not
      * limited to the exact {@link #BEGIN_SCRIPT} sequence.
      *
      * @param in the text to be inspected.
      * @return <code>true</code> if a script block has been detected.
      */
     private boolean containsScriptBlock(String in) {
         final String lower = in.toLowerCase(Locale.ROOT);
         int searchFrom = 0;
         while (true) {
             final int begin = lower.indexOf(SCRIPT_OPEN_PREFIX, searchFrom);
             if (begin == -1) {
                 return false;
             }
             final int afterName = begin + SCRIPT_OPEN_PREFIX.length();
             if (afterName < lower.length()) {
                 final char next = lower.charAt(afterName);
                 // The tag name must end here, otherwise this is another element (e.g. <scripts>).
                 if (next == '>' || next == '/' || Character.isWhitespace(next)) {
                     final int tagEnd = lower.indexOf('>', afterName);
                     // Any later opening tag would search an even shorter tail, so the
                     // outcome for the first real opening tag is final.
                     return tagEnd != -1 && lower.indexOf(END_SCRIPT, tagEnd + 1) != -1;
                 }
             }
             searchFrom = afterName;
         }
     }

     /**
      * This method checks if there is a native nesting relationship between two
      * {@link MicroformatExtractor}.
      *
      * @see org.apache.any23.extractor.html.annotations.Includes
      * @param including the including {@link MicroformatExtractor}
      * @param included the included {@link MicroformatExtractor}
      * @return <code>true</code> if there is a declared nesting relationship
      */
     public static boolean includes(
             Class<? extends MicroformatExtractor> including,
             Class<? extends MicroformatExtractor> included) {
         final Includes includes = including.getAnnotation(Includes.class);
         if (includes == null) {
             return false;
         }
         // Iterating an empty array is a no-op, so no explicit length guard is needed.
         for (Class<? extends MicroformatExtractor> extractor : includes.extractors()) {
             if (extractor.equals(included)) {
                 return true;
             }
         }
         return false;
     }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.any23.extractor.html;

	import org.apache.any23.extractor.ExtractionContext;
	import org.apache.any23.extractor.ExtractionException;
	import org.apache.any23.extractor.ExtractionParameters;
	import org.apache.any23.extractor.ExtractionResult;
	import org.apache.any23.extractor.ExtractorDescription;
	import org.apache.any23.extractor.IssueReport;
	import org.apache.any23.extractor.TagSoupExtractionResult;
	import org.apache.any23.extractor.html.annotations.Includes;
	import org.apache.any23.rdf.Any23ValueFactoryWrapper;
	import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
	import org.eclipse.rdf4j.model.BNode;
	import org.eclipse.rdf4j.model.Literal;
	import org.eclipse.rdf4j.model.Resource;
	import org.eclipse.rdf4j.model.IRI;
	import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
	import org.w3c.dom.Document;
	import org.w3c.dom.Node;

	import java.io.IOException;
	import java.util.Locale;

	/**
	 * The abstract base class for any
	 * <a href="http://microformats.org/">Microformat specification</a> extractor.
	 *
	 * <p>The state of the current extraction (HTML document, context, document IRI and
	 * output sink) is stored in instance fields that
	 * {@link #run(ExtractionParameters, ExtractionContext, Document, ExtractionResult)}
	 * assigns before delegating to {@link #extract()}.</p>
	 */
	public abstract class MicroformatExtractor implements TagSoupDOMExtractor {

		/** Opening script tag without attributes, in lower case. */
		public static final String BEGIN_SCRIPT = "<script>";

		/** Closing script tag, in lower case. */
		public static final String END_SCRIPT = "</script>";

		/** Start of an opening script tag; what follows decides whether it really is one. */
		private static final String SCRIPT_OPEN_PREFIX = "<script";

		private HTMLDocument htmlDocument;

		private ExtractionContext context;

		private IRI documentIRI;

		private ExtractionResult out;

		protected final Any23ValueFactoryWrapper valueFactory =
				new Any23ValueFactoryWrapper(SimpleValueFactory.getInstance());

		/**
		 * Returns the description of this extractor.
		 *
		 * @return a human readable description.
		 */
		public abstract ExtractorDescription getDescription();

		/**
		 * Performs the extraction of the data and writes them to the model.
		 * The nodes generated in the model can have any name or implicit label
		 * but if possible they <i>SHOULD</i> have names (either URIs or AnonId) that
		 * are uniquely derivable from their position in the DOM tree, so that
		 * multiple extractors can merge information.
		 * @return true if extraction is successful
		 * @throws ExtractionException if there is an error during extraction
		 */
		protected abstract boolean extract() throws ExtractionException;

		/**
		 * Returns the HTML document wrapper built by the most recent call to <code>run</code>.
		 *
		 * @return the wrapped document, or <code>null</code> if <code>run</code> was never invoked.
		 */
		public HTMLDocument getHTMLDocument() {
			return htmlDocument;
		}

		/**
		 * Returns the extraction context received by the most recent call to <code>run</code>.
		 *
		 * @return the context, or <code>null</code> if <code>run</code> was never invoked.
		 */
		public ExtractionContext getExtractionContext() {
			return context;
		}

		/**
		 * Returns the document IRI read from the extraction context by the most recent
		 * call to <code>run</code>.
		 *
		 * @return the document IRI, or <code>null</code> if <code>run</code> was never invoked.
		 */
		public IRI getDocumentIRI() {
			return documentIRI;
		}

		/**
		 * Stores the inputs of this extraction in the instance fields, registers the output
		 * as issue report of the value factory, and invokes {@link #extract()}.
		 * The issue report is always detached from the value factory afterwards, even when
		 * the extraction throws. The boolean returned by {@link #extract()} is not used here.
		 *
		 * @param extractionParameters the extraction parameters (not read by this method).
		 * @param extractionContext the context providing the document IRI.
		 * @param in the DOM document to be wrapped in an {@link HTMLDocument}.
		 * @param out the sink receiving triples and issues.
		 * @throws IOException declared by the signature; not raised directly by this method.
		 * @throws ExtractionException if {@link #extract()} fails.
		 */
		public final void run(
				ExtractionParameters extractionParameters,
				ExtractionContext extractionContext,
				Document in,
				ExtractionResult out
		) throws IOException, ExtractionException {
			this.htmlDocument = new HTMLDocument(in);
			this.context = extractionContext;
			this.documentIRI = extractionContext.getDocumentIRI();
			this.out = out;
			valueFactory.setIssueReport(out);
			try {
				extract();
			} finally {
				// Do not let the value factory keep reporting into a finished result.
				valueFactory.setIssueReport(null);
			}
		}

		/**
		 * Returns the {@link org.apache.any23.extractor.ExtractionResult} associated
		 * to the extraction session.
		 *
		 * @return the extraction result currently used as output.
		 */
		protected ExtractionResult getCurrentExtractionResult() {
			return out;
		}

		/**
		 * Replaces the extraction result used as output by all the helper methods of this class.
		 *
		 * @param out the new output.
		 */
		protected void setCurrentExtractionResult(ExtractionResult out) {
			this.out = out;
		}

		/**
		 * Opens a sub result on the current extraction result.
		 *
		 * @param context the context passed to the sub result.
		 * @return the sub result returned by the current extraction result.
		 */
		protected ExtractionResult openSubResult(ExtractionContext context) {
			return out.openSubResult(context);
		}

		/**
		 * Helper method that adds a literal property to a subject only if the value of the property
		 * is a valid string, i.e. it is not <code>null</code> and not empty after trimming.
		 *
		 * @param n the <i>HTML</i> node from which the property value has been extracted.
		 * @param subject the property subject.
		 * @param p the property IRI.
		 * @param value the property value.
		 * @return returns <code>true</code> if the value has been accepted and added, <code>false</code> otherwise.
		 */
		protected boolean conditionallyAddStringProperty(
				Node n,
				Resource subject, IRI p, String value
		) {
			if (value == null) {
				return false;
			}
			final String trimmed = value.trim();
			if (trimmed.isEmpty()) {
				return false;
			}
			return conditionallyAddLiteralProperty(n, subject, p, valueFactory.createLiteral(trimmed));
		}

		/**
		 * Helper method that adds a literal property to a node. Literals whose text contains
		 * a script block are rejected and reported as a warning instead of being written.
		 *
		 * @param n the <i>HTML</i> node from which the property value has been extracted.
		 * @param subject the property subject.
		 * @param property the property IRI.
		 * @param literal the property value.
		 * @return returns <code>true</code> if the literal has been accepted and added, <code>false</code> otherwise.
		 */
		protected boolean conditionallyAddLiteralProperty(
				Node n,
				Resource subject,
				IRI property,
				Literal literal
		) {
			final String literalStr = literal.stringValue();
			if (containsScriptBlock(literalStr)) {
				out.notifyIssue(
						IssueReport.IssueLevel.WARNING,
						String.format(Locale.ROOT, "Detected script in literal: [%s]", literalStr),
						-1,
						-1
				);
				return false;
			}
			out.writeTriple(subject, property, literal);
			recordPropertyPath(n, subject, property, null);
			return true;
		}

		/**
		 * Helper method that adds a IRI property to a node.
		 * @param subject the property subject.
		 * @param property the property IRI.
		 * @param uri the property object.
		 * @return <code>true</code> if the resource has been added, <code>false</code> otherwise.
		 */
		protected boolean conditionallyAddResourceProperty(Resource subject, IRI property, IRI uri) {
			if (uri == null) {
				return false;
			}
			out.writeTriple(subject, property, uri);
			return true;
		}

		/**
		 * Helper method that adds a BNode property to a node and records the path of the
		 * originating <i>HTML</i> node.
		 *
		 * @param n the <i>HTML</i> node used for extracting such property.
		 * @param subject the property subject.
		 * @param property the property IRI.
		 * @param bnode the property value.
		 */
		protected void addBNodeProperty(Node n, Resource subject, IRI property, BNode bnode) {
			out.writeTriple(subject, property, bnode);
			recordPropertyPath(n, subject, property, bnode);
		}

		/**
		 * Helper method that adds a BNode property to a node without recording any
		 * property path.
		 *
		 * @param subject the property subject.
		 * @param property the property IRI.
		 * @param bnode the property value.
		 */
		protected void addBNodeProperty( Resource subject, IRI property, BNode bnode) {
			out.writeTriple(subject, property, bnode);
		}

		/**
		 * Helper method that adds a IRI property to a node.
		 *
		 * @param subject subject to add
		 * @param property predicate to add
		 * @param object object to add
		 */
		protected void addIRIProperty(Resource subject, IRI property, IRI object) {
			out.writeTriple(subject, property, object);
		}

		/**
		 * Fixes a link through the value factory, without any default schema.
		 *
		 * @param link the link to be fixed.
		 * @return the IRI produced by the value factory.
		 */
		protected IRI fixLink(String link) {
			return valueFactory.fixLink(link, null);
		}

		/**
		 * Fixes a link through the value factory.
		 *
		 * @param link the link to be fixed.
		 * @param defaultSchema the default schema handed to the value factory.
		 * @return the IRI produced by the value factory.
		 */
		protected IRI fixLink(String link, String defaultSchema) {
			return valueFactory.fixLink(link, defaultSchema);
		}

		/**
		 * Registers the XPath list of the given node as property path of the statement,
		 * provided that the current output is a {@link TagSoupExtractionResult}.
		 * The output can be replaced with any {@link ExtractionResult} through
		 * {@link #setCurrentExtractionResult(ExtractionResult)}; in that case the triple has
		 * already been written and only the path bookkeeping is skipped, instead of failing
		 * with a <code>ClassCastException</code>.
		 *
		 * @param n the <i>HTML</i> node the statement originates from.
		 * @param subject the statement subject.
		 * @param property the statement predicate.
		 * @param object the blank node object, or <code>null</code> for literal objects.
		 */
		private void recordPropertyPath(Node n, Resource subject, IRI property, BNode object) {
			if (out instanceof TagSoupExtractionResult) {
				((TagSoupExtractionResult) out).addPropertyPath(
						this.getClass(), subject, property, object, DomUtils.getXPathListForNode(n)
				);
			}
		}

		/**
		 * Checks, ignoring case, whether the given text contains an opening script tag
		 * followed by a closing script tag. The opening tag may carry attributes
		 * (e.g. <code>&lt;script type="text/javascript"&gt;</code>), so the check is not
		 * limited to the exact {@link #BEGIN_SCRIPT} sequence.
		 *
		 * @param in the text to be inspected.
		 * @return <code>true</code> if a script block has been detected.
		 */
		private boolean containsScriptBlock(String in) {
			final String lower = in.toLowerCase(Locale.ROOT);
			int searchFrom = 0;
			while (true) {
				final int begin = lower.indexOf(SCRIPT_OPEN_PREFIX, searchFrom);
				if (begin == -1) {
					return false;
				}
				final int afterName = begin + SCRIPT_OPEN_PREFIX.length();
				if (afterName < lower.length()) {
					final char next = lower.charAt(afterName);
					// The tag name must end here, otherwise this is another element (e.g. <scripts>).
					if (next == '>' || next == '/' || Character.isWhitespace(next)) {
						final int tagEnd = lower.indexOf('>', afterName);
						// Any later opening tag would search an even shorter tail, so the
						// outcome for the first real opening tag is final.
						return tagEnd != -1 && lower.indexOf(END_SCRIPT, tagEnd + 1) != -1;
					}
				}
				searchFrom = afterName;
			}
		}

		/**
		 * This method checks if there is a native nesting relationship between two
		 * {@link MicroformatExtractor}.
		 *
		 * @see org.apache.any23.extractor.html.annotations.Includes
		 * @param including the including {@link MicroformatExtractor}
		 * @param included the included {@link MicroformatExtractor}
		 * @return <code>true</code> if there is a declared nesting relationship
		 */
		public static boolean includes(
				Class<? extends MicroformatExtractor> including,
				Class<? extends MicroformatExtractor> included) {
			final Includes includes = including.getAnnotation(Includes.class);
			if (includes == null) {
				return false;
			}
			// Iterating an empty array is a no-op, so no explicit length guard is needed.
			for (Class<? extends MicroformatExtractor> extractor : includes.extractors()) {
				if (extractor.equals(included)) {
					return true;
				}
			}
			return false;
		}

	}