blob: 3ba9c1f83deb91bfa66ff11e7ec1b97e29c2158b [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.any23.extractor.html;
import org.apache.any23.extractor.ExtractionContext;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.extractor.ExtractionParameters;
import org.apache.any23.extractor.ExtractionResult;
import org.apache.any23.extractor.ExtractorDescription;
import org.apache.any23.extractor.IssueReport;
import org.apache.any23.extractor.TagSoupExtractionResult;
import org.apache.any23.extractor.html.annotations.Includes;
import org.apache.any23.rdf.Any23ValueFactoryWrapper;
import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
import org.eclipse.rdf4j.model.BNode;
import org.eclipse.rdf4j.model.Literal;
import org.eclipse.rdf4j.model.Resource;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import java.io.IOException;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * The abstract base class for any <a href="http://microformats.org/">Microformat specification</a> extractor.
 *
 * <p>
 * {@link #run(ExtractionParameters, ExtractionContext, Document, ExtractionResult)} stores the per-run state (HTML
 * document, extraction context, document IRI and output result) in instance fields and then delegates to
 * {@link #extract()}. Subclasses emit triples through the <code>add*Property</code> helpers defined here.
 * </p>
 */
public abstract class MicroformatExtractor implements TagSoupDOMExtractor {

    /**
     * Canonical, attribute-less opening script tag. Retained as public API; script detection in this class also
     * recognises opening tags that carry attributes.
     */
    public static final String BEGIN_SCRIPT = "<script>";

    /**
     * Canonical closing script tag. Retained as public API; script detection in this class also recognises closing
     * tags with trailing whitespace before the <code>&gt;</code>.
     */
    public static final String END_SCRIPT = "</script>";

    /** Matches an opening script tag, with or without attributes, ignoring ASCII case. */
    private static final Pattern SCRIPT_OPEN_TAG = Pattern.compile("<script(?:[\\s/][^>]*)?>",
            Pattern.CASE_INSENSITIVE);

    /** Matches a closing script tag, tolerating trailing characters before the closing bracket. */
    private static final Pattern SCRIPT_CLOSE_TAG = Pattern.compile("</script(?:[\\s/][^>]*)?>",
            Pattern.CASE_INSENSITIVE);

    // Per-run state, (re)assigned at the beginning of every run(...) invocation.
    private HTMLDocument htmlDocument;
    private ExtractionContext context;
    private IRI documentIRI;
    private ExtractionResult out;

    /** Value factory used to build literals and to fix links; reports issues to the current result during a run. */
    protected final Any23ValueFactoryWrapper valueFactory = new Any23ValueFactoryWrapper(
            SimpleValueFactory.getInstance());

    /**
     * Returns the description of this extractor.
     *
     * @return a human readable description.
     */
    public abstract ExtractorDescription getDescription();

    /**
     * Performs the extraction of the data and writes them to the model. The nodes generated in the model can have any
     * name or implicit label but if possible they <i>SHOULD</i> have names (either URIs or AnonId) that are uniquely
     * derivable from their position in the DOM tree, so that multiple extractors can merge information.
     *
     * @return true if extraction is successful
     *
     * @throws ExtractionException
     *             if there is an error during extraction
     */
    protected abstract boolean extract() throws ExtractionException;

    /**
     * Returns the HTML document wrapper created by the most recent run.
     *
     * @return the current {@link HTMLDocument}, or <code>null</code> if no run has been started yet.
     */
    public HTMLDocument getHTMLDocument() {
        return htmlDocument;
    }

    /**
     * Returns the extraction context passed to the most recent run.
     *
     * @return the current {@link ExtractionContext}, or <code>null</code> if no run has been started yet.
     */
    public ExtractionContext getExtractionContext() {
        return context;
    }

    /**
     * Returns the document IRI taken from the extraction context of the most recent run.
     *
     * @return the current document IRI, or <code>null</code> if no run has been started yet.
     */
    public IRI getDocumentIRI() {
        return documentIRI;
    }

    /**
     * Initialises the per-run state and delegates to {@link #extract()}. The boolean returned by {@link #extract()} is
     * not propagated to the caller.
     *
     * @param extractionParameters
     *            the extraction parameters (not used by this base implementation).
     * @param extractionContext
     *            the context of this extraction; also provides the document IRI.
     * @param in
     *            the DOM document to extract from.
     * @param out
     *            the result the extracted triples and issues are written to.
     *
     * @throws IOException
     *             declared by the extractor contract.
     * @throws ExtractionException
     *             if {@link #extract()} fails.
     */
    public final void run(ExtractionParameters extractionParameters, ExtractionContext extractionContext, Document in,
            ExtractionResult out) throws IOException, ExtractionException {
        this.htmlDocument = new HTMLDocument(in);
        this.context = extractionContext;
        this.documentIRI = extractionContext.getDocumentIRI();
        this.out = out;
        valueFactory.setIssueReport(out);
        try {
            extract();
        } finally {
            // Always detach the issue report so the factory does not keep referencing this run's result.
            valueFactory.setIssueReport(null);
        }
    }

    /**
     * Returns the {@link org.apache.any23.extractor.ExtractionResult} associated to the extraction session.
     *
     * @return a valid extraction result.
     */
    protected ExtractionResult getCurrentExtractionResult() {
        return out;
    }

    /**
     * Replaces the extraction result that the helper methods of this class write to.
     *
     * @param out
     *            the new current extraction result.
     */
    protected void setCurrentExtractionResult(ExtractionResult out) {
        this.out = out;
    }

    /**
     * Opens a sub result on the current extraction result.
     *
     * @param context
     *            the extraction context for the sub result.
     *
     * @return the sub result returned by the current extraction result.
     */
    protected ExtractionResult openSubResult(ExtractionContext context) {
        return out.openSubResult(context);
    }

    /**
     * Helper method that adds a literal property to a subject only if the value of the property is a valid string.
     * The value is trimmed first; <code>null</code> and blank-after-trim values are rejected.
     *
     * @param n
     *            the <i>HTML</i> node from which the property value has been extracted.
     * @param subject
     *            the property subject.
     * @param p
     *            the property IRI.
     * @param value
     *            the property value.
     *
     * @return returns <code>true</code> if the value has been accepted and added, <code>false</code> otherwise.
     */
    protected boolean conditionallyAddStringProperty(Node n, Resource subject, IRI p, String value) {
        if (value == null) {
            return false;
        }
        final String trimmed = value.trim();
        if (trimmed.isEmpty()) {
            return false;
        }
        return conditionallyAddLiteralProperty(n, subject, p, valueFactory.createLiteral(trimmed));
    }

    /**
     * Helper method that adds a literal property to a node. A <code>null</code> literal is rejected, and a literal
     * whose string value contains a script block is rejected with a warning notified on the current result.
     *
     * <p>
     * After the triple is written the current result is cast to {@link TagSoupExtractionResult} to record the
     * property path; a current result of another type causes a {@link ClassCastException} at that point.
     * </p>
     *
     * @param n
     *            the <i>HTML</i> node from which the property value has been extracted.
     * @param subject
     *            subject the property subject.
     * @param property
     *            the property IRI.
     * @param literal
     *            value the property value.
     *
     * @return returns <code>true</code> if the literal has been accepted and added, <code>false</code> otherwise.
     */
    protected boolean conditionallyAddLiteralProperty(Node n, Resource subject, IRI property, Literal literal) {
        // Consistent with the other conditionallyAdd* helpers: a missing value is rejected, not an error.
        if (literal == null) {
            return false;
        }
        final String literalStr = literal.stringValue();
        if (containsScriptBlock(literalStr)) {
            out.notifyIssue(IssueReport.IssueLevel.WARNING,
                    String.format(Locale.ROOT, "Detected script in literal: [%s]", literalStr), -1, -1);
            return false;
        }
        out.writeTriple(subject, property, literal);
        recordPropertyPath(n, subject, property, null);
        return true;
    }

    /**
     * Helper method that adds a IRI property to a node.
     *
     * @param subject
     *            the property subject.
     * @param property
     *            the property IRI.
     * @param uri
     *            the property object.
     *
     * @return <code>true</code> if the resource has been added, <code>false</code> otherwise.
     */
    protected boolean conditionallyAddResourceProperty(Resource subject, IRI property, IRI uri) {
        if (uri == null) {
            return false;
        }
        out.writeTriple(subject, property, uri);
        return true;
    }

    /**
     * Helper method that adds a BNode property to a node and records the property path of the originating HTML node.
     *
     * <p>
     * After the triple is written the current result is cast to {@link TagSoupExtractionResult}; a current result of
     * another type causes a {@link ClassCastException} at that point.
     * </p>
     *
     * @param n
     *            the <i>HTML</i> node used for extracting such property.
     * @param subject
     *            the property subject.
     * @param property
     *            the property IRI.
     * @param bnode
     *            the property value.
     */
    protected void addBNodeProperty(Node n, Resource subject, IRI property, BNode bnode) {
        out.writeTriple(subject, property, bnode);
        recordPropertyPath(n, subject, property, bnode);
    }

    /**
     * Helper method that adds a BNode property to a node without recording a property path.
     *
     * @param subject
     *            the property subject.
     * @param property
     *            the property IRI.
     * @param bnode
     *            the property value.
     */
    protected void addBNodeProperty(Resource subject, IRI property, BNode bnode) {
        out.writeTriple(subject, property, bnode);
    }

    /**
     * Helper method that adds a IRI property to a node.
     *
     * @param subject
     *            subject to add
     * @param property
     *            predicate to add
     * @param object
     *            object to add
     */
    protected void addIRIProperty(Resource subject, IRI property, IRI object) {
        out.writeTriple(subject, property, object);
    }

    /**
     * Fixes the given link through the value factory, without a default schema.
     *
     * @param link
     *            the link to fix.
     *
     * @return the IRI produced by the value factory.
     */
    protected IRI fixLink(String link) {
        return valueFactory.fixLink(link, null);
    }

    /**
     * Fixes the given link through the value factory.
     *
     * @param link
     *            the link to fix.
     * @param defaultSchema
     *            the default schema handed to the value factory.
     *
     * @return the IRI produced by the value factory.
     */
    protected IRI fixLink(String link, String defaultSchema) {
        return valueFactory.fixLink(link, defaultSchema);
    }

    /**
     * Registers, on the current result, the XPath list of the HTML node a property was extracted from.
     *
     * @param n
     *            the originating <i>HTML</i> node.
     * @param subject
     *            the property subject.
     * @param property
     *            the property IRI.
     * @param object
     *            the BNode object of the property, or <code>null</code> for literal properties.
     */
    private void recordPropertyPath(Node n, Resource subject, IRI property, BNode object) {
        final TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
        tser.addPropertyPath(this.getClass(), subject, property, object, DomUtils.getXPathListForNode(n));
    }

    /**
     * Checks whether the given text contains an opening script tag followed by a closing script tag. Tags are matched
     * ignoring ASCII case; the opening tag may carry attributes.
     *
     * @param in
     *            the text to inspect.
     *
     * @return <code>true</code> if a script block is detected.
     */
    private boolean containsScriptBlock(String in) {
        final Matcher openTag = SCRIPT_OPEN_TAG.matcher(in);
        if (!openTag.find()) {
            return false;
        }
        // Only the first opening tag needs checking: a closing tag after any later opening tag is also after it.
        return SCRIPT_CLOSE_TAG.matcher(in).find(openTag.end());
    }

    /**
     * This method checks if there is a native nesting relationship between two {@link MicroformatExtractor}.
     *
     * @see org.apache.any23.extractor.html.annotations.Includes
     *
     * @param including
     *            the including {@link MicroformatExtractor}
     * @param included
     *            the included {@link MicroformatExtractor}
     *
     * @return <code>true</code> if there is a declared nesting relationship
     */
    public static boolean includes(Class<? extends MicroformatExtractor> including,
            Class<? extends MicroformatExtractor> included) {
        final Includes includes = including.getAnnotation(Includes.class);
        if (includes == null) {
            return false;
        }
        for (Class<? extends MicroformatExtractor> extractor : includes.extractors()) {
            if (extractor.equals(included)) {
                return true;
            }
        }
        return false;
    }
}