blob: 01f2e6c32ea0fd4e447c43a3996134c90b924ede [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.any23.extractor.html;
import org.apache.any23.extractor.ExtractionContext;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.extractor.ExtractionParameters;
import org.apache.any23.extractor.ExtractionResult;
import org.apache.any23.extractor.ExtractorDescription;
import org.apache.any23.extractor.IssueReport;
import org.apache.any23.extractor.TagSoupExtractionResult;
import org.apache.any23.extractor.html.annotations.Includes;
import org.apache.any23.rdf.Any23ValueFactoryWrapper;
import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
import org.eclipse.rdf4j.model.BNode;
import org.eclipse.rdf4j.model.Literal;
import org.eclipse.rdf4j.model.Resource;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import java.io.IOException;
import java.util.Locale;
/**
 * The abstract base class for any
 * <a href="http://microformats.org/">Microformat specification</a> extractor.
 *
 * <p>{@link #run} stores the state of the current extraction (document, context,
 * document IRI and result) in instance fields before delegating to {@link #extract()},
 * so the helper methods of this class always operate on the most recent invocation.</p>
 */
public abstract class MicroformatExtractor implements TagSoupDOMExtractor {

    /** Opening marker searched for (case-insensitively) when looking for scripts in literals. */
    public static final String BEGIN_SCRIPT = "<script>";

    /** Closing marker searched for (case-insensitively) when looking for scripts in literals. */
    public static final String END_SCRIPT = "</script>";

    private HTMLDocument htmlDocument;

    private ExtractionContext context;

    private IRI documentIRI;

    private ExtractionResult out;

    protected final Any23ValueFactoryWrapper valueFactory =
            new Any23ValueFactoryWrapper(SimpleValueFactory.getInstance());

    /**
     * Returns the description of this extractor.
     *
     * @return a human readable description.
     */
    public abstract ExtractorDescription getDescription();

    /**
     * Performs the extraction of the data and writes them to the model.
     * The nodes generated in the model can have any name or implicit label
     * but if possible they <i>SHOULD</i> have names (either URIs or AnonId) that
     * are uniquely derivable from their position in the DOM tree, so that
     * multiple extractors can merge information.
     *
     * @return true if extraction is successful
     * @throws ExtractionException if there is an error during extraction
     */
    protected abstract boolean extract() throws ExtractionException;

    /**
     * Returns the document wrapper created by the last call to {@link #run}.
     *
     * @return the current HTML document, or <code>null</code> if {@link #run} was never invoked.
     */
    public HTMLDocument getHTMLDocument() {
        return htmlDocument;
    }

    /**
     * Returns the extraction context passed to the last call to {@link #run}.
     *
     * @return the current extraction context, or <code>null</code> if {@link #run} was never invoked.
     */
    public ExtractionContext getExtractionContext() {
        return context;
    }

    /**
     * Returns the document IRI read from the extraction context by the last call to {@link #run}.
     *
     * @return the current document IRI, or <code>null</code> if {@link #run} was never invoked.
     */
    public IRI getDocumentIRI() {
        return documentIRI;
    }

    /**
     * Stores the extraction state in this instance and delegates to {@link #extract()}.
     * The given result is registered as issue report on the value factory for the
     * duration of the extraction and unregistered afterwards, even if the extraction fails.
     * The boolean returned by {@link #extract()} is not inspected here.
     *
     * @param extractionParameters the extraction parameters (not used by this method).
     * @param extractionContext the context of the extraction; provides the document IRI.
     * @param in the DOM document to be processed.
     * @param out the result on which triples and issues are written.
     * @throws IOException declared by the extractor contract.
     * @throws ExtractionException if {@link #extract()} fails.
     */
    public final void run(
            ExtractionParameters extractionParameters,
            ExtractionContext extractionContext,
            Document in,
            ExtractionResult out
    ) throws IOException, ExtractionException {
        this.htmlDocument = new HTMLDocument(in);
        this.context = extractionContext;
        this.documentIRI = extractionContext.getDocumentIRI();
        this.out = out;
        valueFactory.setIssueReport(out);
        try {
            extract();
        } finally {
            // Always detach the report so the factory does not keep writing to a finished result.
            valueFactory.setIssueReport(null);
        }
    }

    /**
     * Returns the {@link org.apache.any23.extractor.ExtractionResult} associated
     * to the extraction session.
     *
     * @return a valid extraction result.
     */
    protected ExtractionResult getCurrentExtractionResult() {
        return out;
    }

    /**
     * Replaces the extraction result used by the helper methods of this class.
     *
     * @param out the new current extraction result.
     */
    protected void setCurrentExtractionResult(ExtractionResult out) {
        this.out = out;
    }

    /**
     * Opens a sub result of the current extraction result for the given context.
     *
     * @param context the context of the sub result.
     * @return the sub result returned by the current extraction result.
     */
    protected ExtractionResult openSubResult(ExtractionContext context) {
        return out.openSubResult(context);
    }

    /**
     * Helper method that adds a literal property to a subject only if the value of the property
     * is a valid string, i.e. it is not <code>null</code> and not empty after trimming.
     *
     * @param n the <i>HTML</i> node from which the property value has been extracted.
     * @param subject the property subject.
     * @param p the property IRI.
     * @param value the property value.
     * @return returns <code>true</code> if the value has been accepted and added, <code>false</code> otherwise.
     */
    protected boolean conditionallyAddStringProperty(
            Node n,
            Resource subject, IRI p, String value
    ) {
        if (value == null) {
            return false;
        }
        final String trimmed = value.trim();
        if (trimmed.isEmpty()) {
            return false;
        }
        return conditionallyAddLiteralProperty(n, subject, p, valueFactory.createLiteral(trimmed));
    }

    /**
     * Helper method that adds a literal property to a node.
     * A <code>null</code> literal, or a literal whose string value contains a script block,
     * is rejected; in the latter case a warning is notified on the current result.
     *
     * @param n the <i>HTML</i> node from which the property value has been extracted.
     * @param subject subject the property subject.
     * @param property the property IRI.
     * @param literal value the property value.
     * @return returns <code>true</code> if the literal has been accepted and added, <code>false</code> otherwise.
     */
    protected boolean conditionallyAddLiteralProperty(
            Node n,
            Resource subject,
            IRI property,
            Literal literal
    ) {
        // Consistent with conditionallyAddResourceProperty: a missing object is simply not added.
        if (literal == null) {
            return false;
        }
        final String literalStr = literal.stringValue();
        if (containsScriptBlock(literalStr)) {
            out.notifyIssue(
                    IssueReport.IssueLevel.WARNING,
                    String.format(Locale.ROOT, "Detected script in literal: [%s]", literalStr),
                    -1,
                    -1
            );
            return false;
        }
        out.writeTriple(subject, property, literal);
        recordPropertyPath(n, subject, property, null);
        return true;
    }

    /**
     * Helper method that adds a IRI property to a node.
     *
     * @param subject the property subject.
     * @param property the property IRI.
     * @param uri the property object.
     * @return <code>true</code> if the resource has been added, <code>false</code> otherwise.
     */
    protected boolean conditionallyAddResourceProperty(Resource subject, IRI property, IRI uri) {
        if (uri == null) {
            return false;
        }
        out.writeTriple(subject, property, uri);
        return true;
    }

    /**
     * Helper method that adds a BNode property to a node.
     *
     * @param n the <i>HTML</i> node used for extracting such property.
     * @param subject the property subject.
     * @param property the property IRI.
     * @param bnode the property value.
     */
    protected void addBNodeProperty(Node n, Resource subject, IRI property, BNode bnode) {
        out.writeTriple(subject, property, bnode);
        recordPropertyPath(n, subject, property, bnode);
    }

    /**
     * Helper method that adds a BNode property to a node.
     *
     * @param subject the property subject.
     * @param property the property IRI.
     * @param bnode the property value.
     */
    protected void addBNodeProperty(Resource subject, IRI property, BNode bnode) {
        out.writeTriple(subject, property, bnode);
    }

    /**
     * Helper method that adds a IRI property to a node.
     *
     * @param subject subject to add
     * @param property predicate to add
     * @param object object to add
     */
    protected void addIRIProperty(Resource subject, IRI property, IRI object) {
        out.writeTriple(subject, property, object);
    }

    /**
     * Fixes the given link through the value factory, without a default schema.
     *
     * @param link the link to be fixed.
     * @return the IRI returned by the value factory.
     */
    protected IRI fixLink(String link) {
        return valueFactory.fixLink(link, null);
    }

    /**
     * Fixes the given link through the value factory.
     *
     * @param link the link to be fixed.
     * @param defaultSchema the default schema handed to the value factory.
     * @return the IRI returned by the value factory.
     */
    protected IRI fixLink(String link, String defaultSchema) {
        return valueFactory.fixLink(link, defaultSchema);
    }

    /**
     * Registers the XPath of the given node as origin of a property, when the current
     * result is able to store it. The current result can be any {@link ExtractionResult}
     * (see {@link #setCurrentExtractionResult}), hence the type check instead of a blind
     * cast that would fail after the triple has already been written.
     *
     * @param n the <i>HTML</i> node from which the property has been extracted.
     * @param subject the property subject.
     * @param property the property IRI.
     * @param object the BNode object of the property, or <code>null</code> for literal values.
     */
    private void recordPropertyPath(Node n, Resource subject, IRI property, BNode object) {
        if (out instanceof TagSoupExtractionResult) {
            final TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
            tser.addPropertyPath(
                    this.getClass(), subject, property, object, DomUtils.getXPathListForNode(n)
            );
        }
    }

    /**
     * Checks whether the given string contains {@link #BEGIN_SCRIPT} followed, at any
     * later position, by {@link #END_SCRIPT}. The comparison ignores case.
     * NOTE(review): only the exact <code>&lt;script&gt;</code> opening tag is matched; an
     * opening tag carrying attributes is not detected - confirm whether this is intended.
     *
     * @param in the string to be inspected.
     * @return <code>true</code> if both markers are found in that order.
     */
    private boolean containsScriptBlock(String in) {
        final String lowered = in.toLowerCase(Locale.ROOT);
        final int openingIndex = lowered.indexOf(BEGIN_SCRIPT);
        if (openingIndex < 0) {
            return false;
        }
        // Search for the closing marker only after the end of the opening one.
        return lowered.indexOf(END_SCRIPT, openingIndex + BEGIN_SCRIPT.length()) >= 0;
    }

    /**
     * This method checks if there is a native nesting relationship between two
     * {@link MicroformatExtractor}.
     *
     * @see org.apache.any23.extractor.html.annotations.Includes
     * @param including the including {@link MicroformatExtractor}
     * @param included the included {@link MicroformatExtractor}
     * @return <code>true</code> if there is a declared nesting relationship
     */
    public static boolean includes(
            Class<? extends MicroformatExtractor> including,
            Class<? extends MicroformatExtractor> included) {
        final Includes includes = including.getAnnotation(Includes.class);
        if (includes == null) {
            return false;
        }
        // An empty extractors array simply yields no iteration.
        for (Class<? extends MicroformatExtractor> extractor : includes.extractors()) {
            if (extractor.equals(included)) {
                return true;
            }
        }
        return false;
    }

}