| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.any23.extractor.html; |
| |
| import org.apache.any23.extractor.ExtractionContext; |
| import org.apache.any23.extractor.ExtractionException; |
| import org.apache.any23.extractor.ExtractionParameters; |
| import org.apache.any23.extractor.ExtractionResult; |
| import org.apache.any23.extractor.ExtractorDescription; |
| import org.apache.any23.extractor.IssueReport; |
| import org.apache.any23.extractor.TagSoupExtractionResult; |
| import org.apache.any23.extractor.html.annotations.Includes; |
| import org.apache.any23.rdf.Any23ValueFactoryWrapper; |
| import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor; |
| import org.eclipse.rdf4j.model.BNode; |
| import org.eclipse.rdf4j.model.Literal; |
| import org.eclipse.rdf4j.model.Resource; |
| import org.eclipse.rdf4j.model.IRI; |
| import org.eclipse.rdf4j.model.impl.SimpleValueFactory; |
| import org.w3c.dom.Document; |
| import org.w3c.dom.Node; |
| |
| import java.io.IOException; |
| import java.util.Locale; |
| |
| /** |
| * The abstract base class for any |
| * <a href="microformats.org/">Microformat specification</a> extractor. |
| */ |
| public abstract class MicroformatExtractor implements TagSoupDOMExtractor { |
| |
| public static final String BEGIN_SCRIPT = "<script>"; |
| public static final String END_SCRIPT = "</script>"; |
| |
| private HTMLDocument htmlDocument; |
| |
| private ExtractionContext context; |
| |
| private IRI documentIRI; |
| |
| private ExtractionResult out; |
| |
| protected final Any23ValueFactoryWrapper valueFactory = |
| new Any23ValueFactoryWrapper(SimpleValueFactory.getInstance()); |
| |
| /** |
| * Returns the description of this extractor. |
| * |
| * @return a human readable description. |
| */ |
| public abstract ExtractorDescription getDescription(); |
| |
| /** |
| * Performs the extraction of the data and writes them to the model. |
| * The nodes generated in the model can have any name or implicit label |
| * but if possible they <i>SHOULD</i> have names (either URIs or AnonId) that |
| * are uniquely derivable from their position in the DOM tree, so that |
| * multiple extractors can merge information. |
| * @return true if extraction is successful |
| * @throws ExtractionException if there is an error during extraction |
| */ |
| protected abstract boolean extract() throws ExtractionException; |
| |
| public HTMLDocument getHTMLDocument() { |
| return htmlDocument; |
| } |
| |
| public ExtractionContext getExtractionContext() { |
| return context; |
| } |
| |
| public IRI getDocumentIRI() { |
| return documentIRI; |
| } |
| |
| public final void run( |
| ExtractionParameters extractionParameters, |
| ExtractionContext extractionContext, |
| Document in, |
| ExtractionResult out |
| ) throws IOException, ExtractionException { |
| this.htmlDocument = new HTMLDocument(in); |
| this.context = extractionContext; |
| this.documentIRI = extractionContext.getDocumentIRI(); |
| this.out = out; |
| valueFactory.setIssueReport(out); |
| try { |
| extract(); |
| } finally { |
| valueFactory.setIssueReport(null); |
| } |
| } |
| |
| /** |
| * Returns the {@link org.apache.any23.extractor.ExtractionResult} associated |
| * to the extraction session. |
| * |
| * @return a valid extraction result. |
| */ |
| protected ExtractionResult getCurrentExtractionResult() { |
| return out; |
| } |
| |
| protected void setCurrentExtractionResult(ExtractionResult out) { |
| this.out = out; |
| } |
| |
| protected ExtractionResult openSubResult(ExtractionContext context) { |
| return out.openSubResult(context); |
| } |
| |
| /** |
| * Helper method that adds a literal property to a subject only if the value of the property |
| * is a valid string. |
| * |
| * @param n the <i>HTML</i> node from which the property value has been extracted. |
| * @param subject the property subject. |
| * @param p the property IRI. |
| * @param value the property value. |
| * @return returns <code>true</code> if the value has been accepted and added, <code>false</code> otherwise. |
| */ |
| protected boolean conditionallyAddStringProperty( |
| Node n, |
| Resource subject, IRI p, String value |
| ) { |
| if (value == null) return false; |
| value = value.trim(); |
| return |
| value.length() > 0 |
| && |
| conditionallyAddLiteralProperty( |
| n, |
| subject, p, valueFactory.createLiteral(value) |
| ); |
| } |
| |
| /** |
| * Helper method that adds a literal property to a node. |
| * |
| * @param n the <i>HTML</i> node from which the property value has been extracted. |
| * @param subject subject the property subject. |
| * @param property the property IRI. |
| * @param literal value the property value. |
| * @return returns <code>true</code> if the literal has been accepted and added, <code>false</code> otherwise. |
| */ |
| protected boolean conditionallyAddLiteralProperty( |
| Node n, |
| Resource subject, |
| IRI property, |
| Literal literal |
| ) { |
| final String literalStr = literal.stringValue(); |
| if( containsScriptBlock(literalStr) ) { |
| out.notifyIssue( |
| IssueReport.IssueLevel.WARNING, |
| String.format(Locale.ROOT, "Detected script in literal: [%s]", literalStr) |
| , -1 |
| , -1 |
| ); |
| return false; |
| } |
| out.writeTriple(subject, property, literal); |
| TagSoupExtractionResult tser = (TagSoupExtractionResult) out; |
| tser.addPropertyPath(this.getClass(), subject, property, null, DomUtils.getXPathListForNode(n) ); |
| return true; |
| } |
| |
| /** |
| * Helper method that adds a IRI property to a node. |
| * @param subject the property subject. |
| * @param property the property IRI. |
| * @param uri the property object. |
| * @return <code>true</code> if the the resource has been added, <code>false</code> otherwise. |
| */ |
| protected boolean conditionallyAddResourceProperty(Resource subject, IRI property, IRI uri) { |
| if (uri == null) return false; |
| out.writeTriple(subject, property, uri); |
| return true; |
| } |
| |
| /** |
| * Helper method that adds a BNode property to a node. |
| * |
| * @param n the <i>HTML</i> node used for extracting such property. |
| * @param subject the property subject. |
| * @param property the property IRI. |
| * @param bnode the property value. |
| */ |
| protected void addBNodeProperty(Node n, Resource subject, IRI property, BNode bnode) { |
| out.writeTriple(subject, property, bnode); |
| TagSoupExtractionResult tser = (TagSoupExtractionResult) out; |
| tser.addPropertyPath(this.getClass(), subject, property, bnode, DomUtils.getXPathListForNode(n) ); |
| } |
| |
| /** |
| * Helper method that adds a BNode property to a node. |
| * |
| * @param subject the property subject. |
| * @param property the property IRI. |
| * @param bnode the property value. |
| */ |
| protected void addBNodeProperty( Resource subject, IRI property, BNode bnode) { |
| out.writeTriple(subject, property, bnode); |
| } |
| |
| /** |
| * Helper method that adds a IRI property to a node. |
| * |
| * @param subject subject to add |
| * @param property predicate to add |
| * @param object object to add |
| */ |
| protected void addIRIProperty(Resource subject, IRI property, IRI object) { |
| out.writeTriple(subject, property, object); |
| } |
| |
| protected IRI fixLink(String link) { |
| return valueFactory.fixLink(link, null); |
| } |
| |
| protected IRI fixLink(String link, String defaultSchema) { |
| return valueFactory.fixLink(link, defaultSchema); |
| } |
| |
| private boolean containsScriptBlock(String in) { |
| final String inLowerCase = in.toLowerCase(Locale.ROOT); |
| final int beginBlock = inLowerCase.indexOf(BEGIN_SCRIPT); |
| if(beginBlock == -1) { |
| return false; |
| } |
| return inLowerCase.indexOf(END_SCRIPT, beginBlock + BEGIN_SCRIPT.length()) != -1; |
| } |
| |
| /** |
| * This method checks if there is a native nesting relationship between two |
| * {@link MicroformatExtractor}. |
| * |
| * @see org.apache.any23.extractor.html.annotations.Includes |
| * @param including the including {@link MicroformatExtractor} |
| * @param included the included {@link MicroformatExtractor} |
| * @return <code>true</code> if there is a declared nesting relationship |
| */ |
| public static boolean includes( |
| Class<? extends MicroformatExtractor>including, |
| Class<? extends MicroformatExtractor> included) { |
| Includes includes = including.getAnnotation(Includes.class); |
| if (includes != null) { |
| Class<? extends MicroformatExtractor>[] extractors = includes.extractors(); |
| if (extractors != null && extractors.length > 0) { |
| for (Class<? extends MicroformatExtractor> extractor : extractors) { |
| if (extractor.equals(included)) { |
| return true; |
| } |
| } |
| } |
| } |
| return false; |
| } |
| |
| |
| } |