blob: a599cdfb3b0e4b996cbc77c4f4898961ae71602c [file] [log] [blame]
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
package org.apache.any23.extractor;
import org.apache.any23.configuration.Configuration;
import org.apache.any23.configuration.DefaultConfiguration;
import org.apache.any23.encoding.EncodingDetector;
import org.apache.any23.encoding.TikaEncodingDetector;
import org.apache.any23.extractor.html.DocumentReport;
import org.apache.any23.extractor.html.HTMLDocument;
import org.apache.any23.extractor.html.MicroformatExtractor;
import org.apache.any23.extractor.html.TagSoupParser;
import org.apache.any23.mime.MIMEType;
import org.apache.any23.mime.MIMETypeDetector;
import org.apache.any23.rdf.Any23ValueFactoryWrapper;
import org.apache.any23.rdf.RDFUtils;
import org.apache.any23.source.DocumentSource;
import org.apache.any23.source.LocalCopyFactory;
import org.apache.any23.source.MemCopyFactory;
import org.apache.any23.validator.EmptyValidationReport;
import org.apache.any23.validator.ValidatorException;
import org.apache.any23.vocab.SINDICE;
import org.apache.any23.writer.CompositeTripleHandler;
import org.apache.any23.writer.CountingTripleHandler;
import org.apache.any23.writer.TripleHandler;
import org.apache.any23.writer.TripleHandlerException;
import org.apache.any23.extractor.Extractor.BlindExtractor;
import org.apache.any23.extractor.Extractor.ContentExtractor;
import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
import org.apache.tika.mime.MimeTypes;
import org.eclipse.rdf4j.model.BNode;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.UUID;
import static org.apache.any23.extractor.TagSoupExtractionResult.PropertyPath;
import static org.apache.any23.extractor.TagSoupExtractionResult.ResourceRoot;
* This class acts as a facade through which all the extractors are run on a single document.
public class SingleDocumentExtraction {
// Shared SINDICE vocabulary instance, used to look up the metadata predicates emitted by this class.
private static final SINDICE vSINDICE = SINDICE.getInstance();
// Class logger.
private static final Logger log = LoggerFactory.getLogger(SingleDocumentExtraction.class);
// Configuration used to build the default extraction parameters.
private final Configuration configuration;
// Input document source.
private final DocumentSource in;
// IRI of the document being processed; assigned at the beginning of run(ExtractionParameters).
private IRI documentIRI;
// All the extractors registered for this extraction, before any MIME type filtering.
private final ExtractorGroup extractors;
// Composite handler receiving the generated triples.
private final TripleHandler output;
// Detector used by detectEncoding() to guess the encoding of the input.
private final EncodingDetector encoderDetector;
// Factory producing the local copy of the input; a MemCopyFactory is created on demand when null.
private LocalCopyFactory copyFactory = null;
// Local version of the input document; assigned by ensureHasLocalCopy().
private DocumentSource localDocumentSource = null;
// Optional MIME type detector; when null every registered extractor is considered matching.
private MIMETypeDetector detector = null;
// Extractors applicable to the document; null until filterExtractorsByMIMEType() has run.
private ExtractorGroup matchingExtractors = null;
// MIME type guessed by the detector, if any.
private MIMEType detectedMIMEType = null;
// Cached TagSoup DOM with its validation report; reset whenever the parser encoding is changed.
private DocumentReport documentReport = null;
// Extraction parameters the cached documentReport was built with.
private ExtractionParameters tagSoupDOMRelatedParameters = null;
// Encoding handed to the TagSoup parser; detected lazily when not set explicitly.
private String parserEncoding = null;
* Builds an extractor by the specification of document source, list of extractors and output triple handler.
* @param configuration
* configuration applied during extraction.
* @param in
* input document source.
* @param extractors
* list of extractors to be applied.
* @param output
* output triple handler.
public SingleDocumentExtraction(Configuration configuration, DocumentSource in, ExtractorGroup extractors,
        TripleHandler output) {
    if (configuration == null) {
        throw new NullPointerException("configuration cannot be null.");
    }
    if (in == null) {
        throw new NullPointerException("in cannot be null.");
    }
    this.configuration = configuration;
    this.in = in;
    this.extractors = extractors;

    // Pair the caller's handler with a counting handler, so that the number of emitted triples
    // can be looked up later by addExtractionTimeSizeMetaTriples().
    final List<TripleHandler> tripleHandlers = new ArrayList<>();
    tripleHandlers.add(output);
    tripleHandlers.add(new CountingTripleHandler());
    this.output = new CompositeTripleHandler(tripleHandlers);
    this.encoderDetector = new TikaEncodingDetector();
}
* Builds an extractor by the specification of document source, extractors factory and output triple handler.
* @param configuration
* configuration applied during extraction.
* @param in
* input document source.
* @param factory
* the extractors factory.
* @param output
* output triple handler.
public SingleDocumentExtraction(Configuration configuration, DocumentSource in, ExtractorFactory<?> factory,
        TripleHandler output) {
    // Wrap the single factory into a one-element group and delegate to the group-based constructor.
    this(
            configuration,
            in,
            new ExtractorGroup(Collections.<ExtractorFactory<?>> singletonList(factory)),
            output);
}
* Builds an extractor by the specification of document source, extractors factory and output triple handler, using
* the {@link org.apache.any23.configuration.DefaultConfiguration}.
* @param in
* input document source.
* @param factory
* the extractors factory.
* @param output
* output triple handler.
public SingleDocumentExtraction(DocumentSource in, ExtractorFactory<?> factory, TripleHandler output) {
    // Same as the factory-based constructor, applied to the default configuration singleton;
    // that constructor wraps the factory into a one-element extractor group.
    this(DefaultConfiguration.singleton(), in, factory, output);
}
* Sets the internal factory for generating the document local copy, if <code>null</code> the
* {@link org.apache.any23.source.MemCopyFactory} will be used.
* @param copyFactory
* local copy factory.
* @see org.apache.any23.source.DocumentSource
public void setLocalCopyFactory(LocalCopyFactory copyFactory) {
this.copyFactory = copyFactory;
* Sets the internal mime type detector, if <code>null</code> mimetype detection will be skipped and all extractors
* will be activated.
* @param detector
* detector instance.
public void setMIMETypeDetector(MIMETypeDetector detector) {
this.detector = detector;
* Triggers the execution of all the {@link Extractor} registered to this class using the specified extraction
* parameters.
* @param extractionParameters
* the parameters applied to the run execution.
* @return the report generated by the extraction.
* @throws ExtractionException
* if an error occurred during the data extraction.
* @throws IOException
* if an error occurred during the data access.
// Runs the matching extractors on the document, consolidates the extracted resources and returns
// the extraction report.
// NOTE(review): this copy of the method is visibly truncated: closing braces are missing, several
// try blocks are empty, and some expressions stop mid-statement (see the notes below). Restore the
// missing statements from version control before relying on this code.
public SingleDocumentExtractionReport run(ExtractionParameters extractionParameters)
throws ExtractionException, IOException {
// Fall back to the default parameters derived from the configuration.
if (extractionParameters == null) {
extractionParameters = ExtractionParameters.newDefault(configuration);
// NOTE(review): the right-hand side is cut off after `extractionParameters`; presumably a property
// lookup for the extraction context IRI. TODO confirm.
final String contextIRI = extractionParameters
// A context IRI equal to "?" selects the IRI of the input document source instead.
try {
this.documentIRI = new Any23ValueFactoryWrapper(SimpleValueFactory.getInstance())
.createIRI("?".equals(contextIRI) ? in.getDocumentIRI() : contextIRI);
} catch (Exception ex) {
throw new IllegalArgumentException("Invalid IRI: " + in.getDocumentIRI(), ex);
if (log.isDebugEnabled()) {
log.debug("Processing " + this.documentIRI);
// Debug output listing the extractors matching the document.
// NOTE(review): matchingExtractors is read here but no visible statement initializes it first;
// a call to filterExtractorsByMIMEType() appears to be missing. TODO confirm.
if (log.isDebugEnabled()) {
StringBuilder sb = new StringBuilder("Extractors ");
for (ExtractorFactory<?> factory : matchingExtractors) {
sb.append(' ');
sb.append("match ").append(documentIRI);
// Accumulators filled while running the single extractors.
final List<ResourceRoot> resourceRoots = new ArrayList<>();
final List<PropertyPath> propertyPaths = new ArrayList<>();
final Map<String, Collection<IssueReport.Issue>> extractorToIssues = new HashMap<>();
// Invoke all extractors.
// NOTE(review): the body of this try block is missing; judging by the error message it should
// notify the output handler that the document is starting. TODO confirm.
try {
} catch (TripleHandlerException e) {
log.error(String.format(Locale.ROOT, "Error starting document with IRI %s", documentIRI));
throw new ExtractionException(
String.format(Locale.ROOT, "Error starting document with IRI %s", documentIRI), e);
try {
// Create the document context.
final String documentLanguage;
try {
documentLanguage = extractDocumentLanguage(extractionParameters);
// Factories that are kept as matching; used below to rebuild matchingExtractors.
ArrayList<ExtractorFactory<?>> filteredList = new ArrayList<>(matchingExtractors.getNumOfExtractors());
final boolean mimeTypeIsTooGeneric = isTooGeneric(detectedMIMEType);
ArrayList<String> intersectionOfRdfMimetypes = null;
for (ExtractorFactory<?> factory : matchingExtractors) {
final Extractor<?> extractor = factory.createExtractor();
final SingleExtractionReport er = runExtractor(extractionParameters, documentLanguage, extractor);
// Fix for ANY23-415:
if (mimeTypeIsTooGeneric) {
// Specific (non generic) MIME types supported by this extractor.
// NOTE(review): the stream pipeline lacks its terminal operation in this copy.
List<String> rdfMimetypes = factory.getSupportedMIMETypes().stream()
.filter(mt -> !isTooGeneric(mt)).map(MIMEType::getFullType)
if (er.touched) {
// If detected mimetype is too generic, but we find extractors matching
// this mimetype that are capable of producing RDF triples from this resource,
// and these extractors are also associated with more specific RDF mimetypes,
// then we can simply take the intersection of these more specific mimetypes
// to narrow down the generic, non-RDF mimetype to a specific RDF mimetype.
if (intersectionOfRdfMimetypes == null) {
intersectionOfRdfMimetypes = new ArrayList<>(rdfMimetypes);
} else {
// NOTE(review): the else branch body (the actual intersection step) is missing.
} else if (!rdfMimetypes.isEmpty()) {
// If detected mimetype is too generic, and this extractor matches both the
// generic mimetype and a more specific mimetype, but did not produce any RDF
// triples, then we can safely assume that this extractor is not actually a
// match for the type of file we are parsing (e.g., a "humans.txt" file).
// NOTE(review): the statement skipping this extractor, and the statements copying the report
// data into resourceRoots, propertyPaths and filteredList, are not present in this copy.
extractorToIssues.put(factory.getExtractorName(), er.issues);
matchingExtractors = new ExtractorGroup(filteredList);
if (intersectionOfRdfMimetypes != null && !intersectionOfRdfMimetypes.isEmpty()) {
// If the detected mimetype is a generic, non-RDF mimetype, and the intersection
// of specific RDF mimetypes across all triple-producing extractors is non-empty,
// simply replace the generic mimetype with a specific RDF mimetype in that intersection.
detectedMIMEType = MIMEType.parse(intersectionOfRdfMimetypes.get(0));
} catch (ValidatorException ve) {
throw new ExtractionException("An error occurred during the validation phase.", ve);
// Resource consolidation.
// NOTE(review): the right-hand side is cut off after `extractionParameters`; presumably a flag lookup.
final boolean addDomainTriples = extractionParameters
final ExtractionContext consolidationContext;
if (extractionParameters.getFlag(ExtractionParameters.METADATA_NESTING_FLAG)) {
// Consolidation with nesting.
// NOTE(review): the last argument(s) of this call are cut off.
consolidationContext = consolidateResources(resourceRoots, propertyPaths, addDomainTriples, output,
} else {
consolidationContext = consolidateResources(resourceRoots, addDomainTriples, output, documentLanguage);
// Adding time/size meta triples.
if (extractionParameters.getFlag(ExtractionParameters.METADATA_TIMESIZE_FLAG)) {
// NOTE(review): the try body is missing; presumably a call to addExtractionTimeSizeMetaTriples().
try {
} catch (TripleHandlerException e) {
throw new ExtractionException(
"Error while adding extraction metadata triples document with IRI %s", documentIRI),
} finally {
// NOTE(review): the try body is missing; judging by the error message it should notify the
// output handler that the document has ended. TODO confirm.
try {
} catch (TripleHandlerException e) {
log.error(String.format(Locale.ROOT, "Error ending document with IRI %s", documentIRI));
throw new ExtractionException(
String.format(Locale.ROOT, "Error ending document with IRI %s", documentIRI), e);
// The validation report is empty when no TagSoup DOM was built during this run.
// NOTE(review): the remaining constructor argument(s) are cut off.
return new SingleDocumentExtractionReport(
documentReport == null ? EmptyValidationReport.getInstance() : documentReport.getReport(),
private static boolean isTooGeneric(MIMEType type) {
if (type == null || type.isAnySubtype()) {
return true;
String mt = type.getFullType();
return mt.equals(MimeTypes.PLAIN_TEXT) || mt.equals(MimeTypes.OCTET_STREAM) || mt.equals(MimeTypes.XML);
* Triggers the execution of all the {@link Extractor} registered to this class using the <i>default</i> extraction
* parameters.
* @throws IOException
* if there is an error reading input from the document source
* @throws ExtractionException
* if there is an error during extraction
* @return the extraction report.
public SingleDocumentExtractionReport run() throws IOException, ExtractionException {
return run(ExtractionParameters.newDefault(configuration));
* Returns the detected mimetype for the given {@link org.apache.any23.source.DocumentSource}.
* @return string containing the detected mimetype.
* @throws IOException
* if an error occurred while accessing the data.
public String getDetectedMIMEType() throws IOException {
return detectedMIMEType == null ? null : detectedMIMEType.toString();
* Checks whether the content of the given {@link org.apache.any23.source.DocumentSource} activates at least one
* extractor.
* @return <code>true</code> if at least an extractor is activated, <code>false</code> otherwise.
* @throws IOException
* if there is an error locating matching extractors
public boolean hasMatchingExtractors() throws IOException {
return !matchingExtractors.isEmpty();
* @return the list of all the activated extractors for the given {@link org.apache.any23.source.DocumentSource}.
public List<Extractor> getMatchingExtractors() {
final List<Extractor> extractorsList = new ArrayList<>();
for (ExtractorFactory extractorFactory : matchingExtractors) {
return extractorsList;
* @return the configured parsing encoding.
public String getParserEncoding() {
if (this.parserEncoding == null) {
this.parserEncoding = detectEncoding();
return this.parserEncoding;
* Sets the document parser encoding.
* @param encoding
* parser encoding.
public void setParserEncoding(String encoding) {
this.parserEncoding = encoding;
documentReport = null;
* Checks whether the given {@link org.apache.any23.source.DocumentSource} is an <b>HTML</b> document.
* @return <code>true</code> if the document source is an HTML document.
* @throws IOException
* if an error occurs while accessing data.
private boolean isHTMLDocument() throws IOException {
return !matchingExtractors.filterByMIMEType(MIMEType.parse("text/html")).isEmpty();
* Extracts the document language where possible.
* @param extractionParameters
* extraction parameters to be applied to determine the document language.
* @return the document language if any, <code>null</code> otherwise.
* @throws IOException
* if an error occurs during the document analysis.
* @throws org.apache.any23.validator.ValidatorException
private String extractDocumentLanguage(ExtractionParameters extractionParameters)
throws IOException, ValidatorException {
if (!isHTMLDocument()) {
return null;
final HTMLDocument document;
try {
document = new HTMLDocument(getTagSoupDOM(extractionParameters).getDocument());
} catch (IOException ioe) {
log.debug("Cannot extract language from document.", ioe);
return null;
return document.getDefaultLanguage();
* Generates a list of extractors that can be applied to the given document.
* @throws IOException
private void filterExtractorsByMIMEType() throws IOException {
if (matchingExtractors != null)
return; // has already been run.
if (detector == null || extractors.allExtractorsSupportAllContentTypes()) {
matchingExtractors = extractors;
// detect MIME based on the real file IRI rather than based on given base namespace
detectedMIMEType = detector.guessMIMEType(,
localDocumentSource.openInputStream(), MIMEType.parse(localDocumentSource.getContentType()));
log.debug("detected media type: " + detectedMIMEType);
matchingExtractors = extractors.filterByMIMEType(detectedMIMEType);
* Triggers the execution of a specific {@link Extractor}.
* @param extractionParameters
* the parameters used for the extraction.
* @param extractor
* the {@link Extractor} to be executed.
* @throws ExtractionException
* if an error specific to an extractor happens.
* @throws IOException
* if an IO error occurs during the extraction.
* @return the roots of the resources that have been extracted.
* @throws org.apache.any23.validator.ValidatorException
* if an error occurs during validation.
private SingleExtractionReport runExtractor(final ExtractionParameters extractionParameters,
final String documentLanguage, final Extractor<?> extractor)
throws ExtractionException, IOException, ValidatorException {
if (log.isDebugEnabled()) {
log.debug("Running {} on {}", extractor.getDescription().getExtractorName(), documentIRI);
long startTime = System.currentTimeMillis();
final ExtractionContext extractionContext = new ExtractionContext(extractor.getDescription().getExtractorName(),
documentIRI, documentLanguage);
final ExtractionResultImpl extractionResult = new ExtractionResultImpl(extractionContext, extractor, output);
try {
if (extractor instanceof BlindExtractor) {
final BlindExtractor blindExtractor = (BlindExtractor) extractor;, extractionContext, documentIRI, extractionResult);
} else if (extractor instanceof ContentExtractor) {
final ContentExtractor contentExtractor = (ContentExtractor) extractor;, extractionContext, localDocumentSource.openInputStream(),
} else if (extractor instanceof TagSoupDOMExtractor) {
final TagSoupDOMExtractor tagSoupDOMExtractor = (TagSoupDOMExtractor) extractor;
final DocumentReport documentReport = getTagSoupDOM(extractionParameters);, extractionContext, documentReport.getDocument(),
} else {
throw new IllegalStateException("Extractor type not supported: " + extractor.getClass());
return new SingleExtractionReport(extractionResult.getIssues(),
new ArrayList<ResourceRoot>(extractionResult.getResourceRoots()),
new ArrayList<PropertyPath>(extractionResult.getPropertyPaths()), extractionResult.wasTouched());
} catch (ExtractionException ex) {
if (log.isDebugEnabled()) {
log.debug(extractor.getDescription().getExtractorName() + ": " + ex.getMessage());
throw ex;
} finally {
// Logging result error report.
if (log.isDebugEnabled() && extractionResult.hasIssues()) {
ByteArrayOutputStream baos = new ByteArrayOutputStream();
extractionResult.printReport(new PrintStream(baos, true, "UTF-8"));
long elapsed = System.currentTimeMillis() - startTime;
if (log.isDebugEnabled()) {
log.debug("Completed " + extractor.getDescription().getExtractorName() + ", " + elapsed + "ms");
* Forces the retrieval of the document data.
* @throws IOException
private void ensureHasLocalCopy() throws IOException {
if (localDocumentSource != null)
if (in.isLocal()) {
localDocumentSource = in;
if (copyFactory == null) {
copyFactory = new MemCopyFactory();
localDocumentSource = copyFactory.createLocalCopy(in);
* Returns the DOM of the given document source (that must be an HTML stream) and the report of eventual fixes
* applied on it.
* @param extractionParameters
* parameters to be used during extraction.
* @return document report.
* @throws IOException
* if an error occurs during data access.
* @throws ValidatorException
* if an error occurs during validation.
private DocumentReport getTagSoupDOM(ExtractionParameters extractionParameters)
throws IOException, ValidatorException {
if (documentReport == null || !extractionParameters.equals(tagSoupDOMRelatedParameters)) {
final InputStream is = new BufferedInputStream(localDocumentSource.openInputStream());
final String candidateEncoding = getParserEncoding();
final TagSoupParser tagSoupParser = new TagSoupParser(is, documentIRI.stringValue(), candidateEncoding);
if (extractionParameters.isValidate()) {
documentReport = tagSoupParser.getValidatedDOM(extractionParameters.isFix());
} else {
documentReport = new DocumentReport(EmptyValidationReport.getInstance(), tagSoupParser.getDOM());
tagSoupDOMRelatedParameters = extractionParameters;
return documentReport;
* Detects the encoding of the local document source input stream.
* @return a valid encoding value.
private String detectEncoding() {
try {
InputStream is = new BufferedInputStream(localDocumentSource.openInputStream());
String encoding = this.encoderDetector.guessEncoding(is, localDocumentSource.getContentType());
return encoding;
} catch (Exception e) {
throw new RuntimeException("An error occurred while trying to detect the input encoding.", e);
* This function verifies if the <i>candidateSub</i> list of strings is a prefix of <i>list</i>.
* @param list
* a list of strings.
* @param candidateSub
* a list of strings.
* @return <code>true</code> if <i>candidateSub</i> is a sub path of <i>list</i>, <code>false</code> otherwise.
private boolean subPath(String[] list, String[] candidateSub) {
if (candidateSub.length > list.length) {
return false;
for (int i = 0; i < candidateSub.length; i++) {
if (!candidateSub[i].equals(list[i])) {
return false;
return true;
* Adds for every resource root node a page domain triple.
* @param resourceRoots
* list of resource roots.
* @param context
* extraction context to produce triples.
* @throws ExtractionException
private void addDomainTriplesPerResourceRoots(List<ResourceRoot> resourceRoots, ExtractionContext context)
throws ExtractionException {
try {
// Add source Web domains to every resource root.
String domain;
try {
domain = new;
} catch (URISyntaxException urise) {
throw new IllegalArgumentException("An error occurred while extracting the host from the document IRI.",
if (domain != null) {
for (ResourceRoot resourceRoot : resourceRoots) {
output.receiveTriple(resourceRoot.getRoot(), vSINDICE.getProperty(SINDICE.DOMAIN),
SimpleValueFactory.getInstance().createLiteral(domain), null, context);
} catch (TripleHandlerException e) {
throw new ExtractionException("Error while writing triple triple.", e);
} finally {
try {
} catch (TripleHandlerException e) {
throw new ExtractionException("Error while closing context.", e);
* @return an extraction context specific for consolidation triples.
private ExtractionContext createExtractionContext(String defaultLanguage) {
return new ExtractionContext("consolidation-extractor", documentIRI, defaultLanguage,
* Detects the nesting relationships among different Microformats and makes them explicit by adding connection triples.
* @param resourceRoots
* @param propertyPaths
* @param context
* @throws TripleHandlerException
private void addNestingRelationship(List<ResourceRoot> resourceRoots, List<PropertyPath> propertyPaths,
ExtractionContext context) throws TripleHandlerException {
ResourceRoot currentResourceRoot;
PropertyPath currentPropertyPath;
for (int r = 0; r < resourceRoots.size(); r++) {
currentResourceRoot = resourceRoots.get(r);
for (int p = 0; p < propertyPaths.size(); p++) {
currentPropertyPath = propertyPaths.get(p);
Class<? extends MicroformatExtractor> currentResourceRootExtractor = currentResourceRoot.getExtractor();
Class<? extends MicroformatExtractor> currentPropertyPathExtractor = currentPropertyPath.getExtractor();
// Avoid wrong nesting relationships.
if (currentResourceRootExtractor.equals(currentPropertyPathExtractor)) {
// Avoid self declaring relationships
if (MicroformatExtractor.includes(currentPropertyPathExtractor, currentResourceRootExtractor)) {
if (subPath(currentResourceRoot.getPath(), currentPropertyPath.getPath())) {
createNestingRelationship(currentPropertyPath, currentResourceRoot, output, context);
* This method consolidates the graphs extracted from the same document. In particular it adds:
* <ul>
* <li>for every microformat root node a triple indicating the original Web page domain;</li>
* <li>triples indicating the nesting relationship among a microformat root and property paths of other nested
* microformats.</li>
* </ul>
* @param resourceRoots
* list of RDF nodes representing roots of extracted microformat graphs and the corresponding HTML paths.
* @param propertyPaths
* list of RDF nodes representing property subjects, property IRIs and the HTML paths from which such
* properties have been extracted.
* @param addDomainTriples
* @param output
* a triple handler event collector.
* @return the extraction context used for the consolidation triples.
* @throws ExtractionException
private ExtractionContext consolidateResources(List<ResourceRoot> resourceRoots, List<PropertyPath> propertyPaths,
boolean addDomainTriples, TripleHandler output, String defaultLanguage) throws ExtractionException {
final ExtractionContext context = createExtractionContext(defaultLanguage);
try {
} catch (TripleHandlerException e) {
throw new ExtractionException(
String.format(Locale.ROOT, "Error starting document with IRI %s", documentIRI), e);
try {
if (addDomainTriples) {
addDomainTriplesPerResourceRoots(resourceRoots, context);
addNestingRelationship(resourceRoots, propertyPaths, context);
} catch (TripleHandlerException the) {
throw new ExtractionException("Error while writing triple triple.", the);
} finally {
try {
} catch (TripleHandlerException e) {
throw new ExtractionException("Error while closing context.", e);
return context;
* This method consolidates the graphs extracted from the same document. In particular it adds:
* <ul>
* <li>for every microformat root node a triple indicating the original Web page domain;</li>
* </ul>
* @param resourceRoots
* list of RDF nodes representing roots of extracted microformat graphs and the corresponding HTML paths.
* from which such properties have been extracted.
* @param addDomainTriples
* @param output
* a triple handler event collector.
* @return the extraction context used for the consolidation triples.
* @throws ExtractionException
private ExtractionContext consolidateResources(List<ResourceRoot> resourceRoots, boolean addDomainTriples,
TripleHandler output, String defaultLanguage) throws ExtractionException {
final ExtractionContext context = createExtractionContext(defaultLanguage);
try {
} catch (TripleHandlerException e) {
throw new ExtractionException(
String.format(Locale.ROOT, "Error starting document with IRI %s", documentIRI), e);
try {
if (addDomainTriples) {
addDomainTriplesPerResourceRoots(resourceRoots, context);
} finally {
try {
} catch (TripleHandlerException the) {
throw new ExtractionException("Error while closing context.", the);
return context;
* Adds metadata triples containing the number of extracted triples and the extraction timestamp.
* @param context
* @throws TripleHandlerException
private void addExtractionTimeSizeMetaTriples(ExtractionContext context) throws TripleHandlerException {
// adding extraction date
String xsdDateTimeNow = RDFUtils.toXSDDateTime(new Date());
vSINDICE.getProperty(SINDICE.DATE), SimpleValueFactory.getInstance().createLiteral(xsdDateTimeNow),
null, context);
// adding number of extracted triples
int numberOfTriples = 0;
CompositeTripleHandler cth = (CompositeTripleHandler) output;
for (TripleHandler th : cth.getChilds()) {
if (th instanceof CountingTripleHandler) {
numberOfTriples = ((CountingTripleHandler) th).getCount();
vSINDICE.getProperty(SINDICE.SIZE), SimpleValueFactory.getInstance().createLiteral(numberOfTriples + 1), // the
// number
// of
// triples
// plus
// itself
null, context);
* Creates a nesting relationship triple.
* @param from
* the property containing the nested microformat.
* @param to
* the root to the nested microformat.
* @param th
* the triple handler.
* @param ec
* the extraction context used to add such information.
* @throws org.apache.any23.writer.TripleHandlerException
private void createNestingRelationship(PropertyPath from, ResourceRoot to, TripleHandler th, ExtractionContext ec)
throws TripleHandlerException {
final BNode fromObject = from.getObject();
final String bNodeHash = from.getProperty().stringValue() + (fromObject == null ? "" : fromObject.getID());
BNode bnode = RDFUtils.getBNode(bNodeHash);
th.receiveTriple(bnode, vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), from.getProperty(), null, ec);
th.receiveTriple(bnode, vSINDICE.getProperty(SINDICE.NESTING_STRUCTURED),
from.getObject() == null ? to.getRoot() : from.getObject(), null, ec);
th.receiveTriple(from.getSubject(), vSINDICE.getProperty(SINDICE.NESTING), bnode, null, ec);
* Entity detection report.
private static class SingleExtractionReport {
private final Collection<IssueReport.Issue> issues;
private final List<ResourceRoot> resourceRoots;
private final List<PropertyPath> propertyPaths;
private final boolean touched;
public SingleExtractionReport(Collection<IssueReport.Issue> issues, List<ResourceRoot> resourceRoots,
List<PropertyPath> propertyPaths, boolean wasTouched) {
this.issues = issues;
this.resourceRoots = resourceRoots;
this.propertyPaths = propertyPaths;
this.touched = wasTouched;