// core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java - any23 - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *  http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.any23.extractor;

 import org.apache.any23.configuration.Configuration;
 import org.apache.any23.configuration.DefaultConfiguration;
 import org.apache.any23.encoding.EncodingDetector;
 import org.apache.any23.encoding.TikaEncodingDetector;
 import org.apache.any23.extractor.html.DocumentReport;
 import org.apache.any23.extractor.html.HTMLDocument;
 import org.apache.any23.extractor.html.MicroformatExtractor;
 import org.apache.any23.extractor.html.TagSoupParser;
 import org.apache.any23.mime.MIMEType;
 import org.apache.any23.mime.MIMETypeDetector;
 import org.apache.any23.rdf.Any23ValueFactoryWrapper;
 import org.apache.any23.rdf.RDFUtils;
 import org.apache.any23.source.DocumentSource;
 import org.apache.any23.source.LocalCopyFactory;
 import org.apache.any23.source.MemCopyFactory;
 import org.apache.any23.validator.EmptyValidationReport;
 import org.apache.any23.validator.ValidatorException;
 import org.apache.any23.vocab.SINDICE;
 import org.apache.any23.writer.CompositeTripleHandler;
 import org.apache.any23.writer.CountingTripleHandler;
 import org.apache.any23.writer.TripleHandler;
 import org.apache.any23.writer.TripleHandlerException;
 import org.apache.any23.extractor.Extractor.BlindExtractor;
 import org.apache.any23.extractor.Extractor.ContentExtractor;
 import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
 import org.apache.tika.mime.MimeTypes;
 import org.eclipse.rdf4j.model.BNode;
 import org.eclipse.rdf4j.model.IRI;
 import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import java.io.BufferedInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.PrintStream;
 import java.net.URISyntaxException;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.Date;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
 import java.util.UUID;
 import java.util.stream.Collectors;

 import static org.apache.any23.extractor.TagSoupExtractionResult.PropertyPath;
 import static org.apache.any23.extractor.TagSoupExtractionResult.ResourceRoot;

 /**
  * This class acts as a facade through which all the registered extractors are invoked on a single document.
  */
 public class SingleDocumentExtraction {

     // Vocabulary used to look up the domain property emitted for resource roots.
     private static final SINDICE vSINDICE = SINDICE.getInstance();

     private static final Logger log = LoggerFactory.getLogger(SingleDocumentExtraction.class);

     // Configuration used to build the default extraction parameters.
     private final Configuration configuration;

     // The document source given at construction time (never null).
     private final DocumentSource in;

     // IRI identifying the document; assigned at the beginning of run(ExtractionParameters).
     private IRI documentIRI;

     // All the extractors supplied at construction time, before any MIME type filtering.
     private final ExtractorGroup extractors;

     // Composite handler wrapping the caller's handler and a CountingTripleHandler.
     private final TripleHandler output;

     // Detector used to guess the document encoding when none has been set explicitly.
     private final EncodingDetector encoderDetector;

     // Factory for the local copy of non local sources; defaulted to MemCopyFactory on first need.
     private LocalCopyFactory copyFactory = null;

     // Locally readable view of the input; populated lazily by ensureHasLocalCopy().
     private DocumentSource localDocumentSource = null;

     // MIME type detector; when null no detection is performed and all extractors are used.
     private MIMETypeDetector detector = null;

     // Extractors selected for this document; null until filterExtractorsByMIMEType() has run.
     private ExtractorGroup matchingExtractors = null;

     // MIME type guessed by the detector; stays null when detection is skipped.
     private MIMEType detectedMIMEType = null;

     // Cached TagSoup DOM report; discarded when the parser encoding is changed.
     private DocumentReport documentReport = null;

     // Extraction parameters the cached documentReport was produced with.
     private ExtractionParameters tagSoupDOMRelatedParameters = null;

     // Encoding handed to the HTML parser; detected lazily when not set explicitly.
     private String parserEncoding = null;

     /**
      * Creates an extraction session for a single document, driven by a group
      * of extractors and reporting to the given triple handler.
      *
      * @param configuration configuration applied during extraction, must not be <code>null</code>.
      * @param in input document source, must not be <code>null</code>.
      * @param extractors group of extractors to be applied.
      * @param output output triple handler; it is wrapped, together with a
      *               {@link CountingTripleHandler}, into a {@link CompositeTripleHandler}.
      * @throws NullPointerException if <code>configuration</code> or <code>in</code> is <code>null</code>.
      */
     public SingleDocumentExtraction(
             Configuration configuration, DocumentSource in, ExtractorGroup extractors, TripleHandler output
     ) {
         if (configuration == null) {
             throw new NullPointerException("configuration cannot be null.");
         }
         if (in == null) {
             throw new NullPointerException("in cannot be null.");
         }

         // Every event is fanned out to the caller's handler and to a triple counter.
         final List<TripleHandler> delegates = new ArrayList<>();
         delegates.add(output);
         delegates.add(new CountingTripleHandler());

         this.configuration = configuration;
         this.in = in;
         this.extractors = extractors;
         this.output = new CompositeTripleHandler(delegates);
         this.encoderDetector = new TikaEncodingDetector();
     }

     /**
      * Creates an extraction session that applies a single extractor,
      * produced by the given factory, to the document source.
      * MIME type detection is disabled for sessions created this way.
      *
      * @param configuration configuration applied during extraction.
      * @param in input document source.
      * @param factory the factory of the only extractor to be applied.
      * @param output output triple handler.
      */
     public SingleDocumentExtraction(
             Configuration configuration, DocumentSource in, ExtractorFactory<?> factory, TripleHandler output
     ) {
         this(configuration, in, new ExtractorGroup(Collections.<ExtractorFactory<?>>singletonList(factory)), output);
         // No detector: the single extractor is always activated.
         this.setMIMETypeDetector(null);
     }

     /**
      * Creates an extraction session that applies a single extractor,
      * produced by the given factory, to the document source, using the
      * {@link org.apache.any23.configuration.DefaultConfiguration}.
      * MIME type detection is disabled for sessions created this way.
      *
      * @param in input document source.
      * @param factory the factory of the only extractor to be applied.
      * @param output output triple handler.
      */
     public SingleDocumentExtraction(
         DocumentSource in, ExtractorFactory<?> factory, TripleHandler output
     ) {
         // The configuration-aware factory constructor wraps the factory into a
         // singleton group and clears the MIME type detector, exactly as needed here.
         this(DefaultConfiguration.singleton(), in, factory, output);
     }

     /**
      * Configures the factory used to create the local copy of the document.
      * When <code>null</code>, a {@link org.apache.any23.source.MemCopyFactory}
      * is used the first time a local copy is needed.
      *
      * @param copyFactory local copy factory, may be <code>null</code>.
      * @see org.apache.any23.source.DocumentSource
      */
     public void setLocalCopyFactory(final LocalCopyFactory copyFactory) {
         this.copyFactory = copyFactory;
     }

     /**
      * Configures the MIME type detector used to select the extractors.
      * When <code>null</code>, MIME type detection is skipped
      * and all the extractors are activated.
      *
      * @param detector detector instance, may be <code>null</code>.
      */
     public void setMIMETypeDetector(final MIMETypeDetector detector) {
         this.detector = detector;
     }

     /**
      * Triggers the execution of all the {@link Extractor}
      * registered to this class using the specified extraction parameters.
      *
      * <p>The run proceeds as follows: the document IRI is resolved (from the
      * {@link ExtractionParameters#EXTRACTION_CONTEXT_IRI_PROPERTY} property, or from the
      * document source when that property is <code>"?"</code>), the extractors are filtered
      * by MIME type, every matching extractor is executed, the extracted resources are
      * consolidated and, if requested, time/size metadata triples are added. The output
      * handler is always notified of the end of the document once it has been started.</p>
      *
      * @param extractionParameters the parameters applied to the run execution;
      *        if <code>null</code> the defaults for the current configuration are used.
      * @return the report generated by the extraction.
      * @throws ExtractionException if an error occurred during the data extraction.
      * @throws IOException if an error occurred during the data access.
      * @throws IllegalArgumentException if the document IRI cannot be created.
      */
     public SingleDocumentExtractionReport run(ExtractionParameters extractionParameters)
     throws ExtractionException, IOException {
         if (extractionParameters == null) {
             extractionParameters = ExtractionParameters.newDefault(configuration);
         }

         final String contextIRI = extractionParameters.getProperty(ExtractionParameters.EXTRACTION_CONTEXT_IRI_PROPERTY);
         ensureHasLocalCopy();
         // Keep track of the string that is actually parsed, so that a failure reports the
         // real culprit: the configured context IRI is not necessarily the source IRI.
         String candidateIRI = contextIRI;
         try {
             if ("?".equals(contextIRI)) {
                 candidateIRI = in.getDocumentIRI();
             }
             this.documentIRI = new Any23ValueFactoryWrapper(
                     SimpleValueFactory.getInstance()
             ).createIRI(candidateIRI);
         } catch (Exception ex) {
             throw new IllegalArgumentException("Invalid IRI: " + candidateIRI, ex);
         }
         log.debug("Processing {}", this.documentIRI);
         filterExtractorsByMIMEType();

         if (log.isDebugEnabled()) {
             StringBuilder sb = new StringBuilder("Extractors ");
             for (ExtractorFactory<?> factory : matchingExtractors) {
                 sb.append(factory.getExtractorName());
                 sb.append(' ');
             }
             sb.append("match ").append(documentIRI);
             log.debug(sb.toString());
         }

         final List<ResourceRoot> resourceRoots = new ArrayList<>();
         final List<PropertyPath> propertyPaths = new ArrayList<>();
         final Map<String,Collection<IssueReport.Issue>> extractorToIssues = new HashMap<>();

         // Invoke all extractors.
         try {
             output.startDocument(documentIRI);
         } catch (TripleHandlerException e) {
             log.error(String.format(Locale.ROOT, "Error starting document with IRI %s", documentIRI));
             throw new ExtractionException(String.format(Locale.ROOT, "Error starting document with IRI %s", documentIRI),
                     e
             );
         }
         try {
             output.setContentLength(in.getContentLength());
             // Create the document context.
             final String documentLanguage;
             try {
                 documentLanguage = extractDocumentLanguage(extractionParameters);
                 List<ExtractorFactory<?>> filteredList = new ArrayList<>(matchingExtractors.getNumOfExtractors());
                 final boolean mimeTypeIsTooGeneric = isTooGeneric(detectedMIMEType);
                 List<String> intersectionOfRdfMimetypes = null;
                 for (ExtractorFactory<?> factory : matchingExtractors) {
                     final Extractor<?> extractor = factory.createExtractor();
                     final SingleExtractionReport er = runExtractor(
                             extractionParameters,
                             documentLanguage,
                             extractor
                     );
                     // Fix for ANY23-415:
                     if (mimeTypeIsTooGeneric) {
                         List<String> rdfMimetypes = factory.getSupportedMIMETypes().stream()
                                 .filter(mt -> !isTooGeneric(mt))
                                 .map(MIMEType::getFullType)
                                 .collect(Collectors.toList());
                         if (er.touched) {
                             // If detected mimetype is too generic, but we find extractors matching
                             // this mimetype that are capable of producing RDF triples from this resource,
                             // and these extractors are also associated with more specific RDF mimetypes,
                             // then we can simply take the intersection of these more specific mimetypes
                             // to narrow down the generic, non-RDF mimetype to a specific RDF mimetype.
                             if (intersectionOfRdfMimetypes == null) {
                                 intersectionOfRdfMimetypes = new ArrayList<>(rdfMimetypes);
                             } else {
                                 intersectionOfRdfMimetypes.retainAll(rdfMimetypes);
                             }
                         } else if (!rdfMimetypes.isEmpty()) {
                             // If detected mimetype is too generic, and this extractor matches both the
                             // generic mimetype and a more specific mimetype, but did not produce any RDF
                             // triples, then we can safely assume that this extractor is not actually a
                             // match for the type of file we are parsing (e.g., a "humans.txt" file).
                             continue;
                         }
                     }
                     resourceRoots.addAll( er.resourceRoots );
                     propertyPaths.addAll( er.propertyPaths );
                     filteredList.add(factory);
                     extractorToIssues.put(factory.getExtractorName(), er.issues);
                 }
                 matchingExtractors = new ExtractorGroup(filteredList);
                 if (intersectionOfRdfMimetypes != null && !intersectionOfRdfMimetypes.isEmpty()) {
                     // If the detected mimetype is a generic, non-RDF mimetype, and the intersection
                     // of specific RDF mimetypes across all triple-producing extractors is non-empty,
                     // simply replace the generic mimetype with a specific RDF mimetype in that intersection.
                     detectedMIMEType = MIMEType.parse(intersectionOfRdfMimetypes.get(0));
                 }
             } catch(ValidatorException ve) {
                 throw new ExtractionException("An error occurred during the validation phase.", ve);
             }

             // Resource consolidation.
             final boolean addDomainTriples = extractionParameters.getFlag(ExtractionParameters.METADATA_DOMAIN_PER_ENTITY_FLAG);
             final ExtractionContext consolidationContext;
             if (extractionParameters.getFlag(ExtractionParameters.METADATA_NESTING_FLAG)) {
                 // Consolidation with nesting.
                 consolidationContext = consolidateResources(resourceRoots, propertyPaths, addDomainTriples, output, documentLanguage);
             } else {
                 consolidationContext = consolidateResources(resourceRoots, addDomainTriples, output, documentLanguage);
             }

             // Adding time/size meta triples.
             if (extractionParameters.getFlag(ExtractionParameters.METADATA_TIMESIZE_FLAG)) {
                 try {
                     addExtractionTimeSizeMetaTriples(consolidationContext);
                 } catch (TripleHandlerException e) {
                     throw new ExtractionException(
                             String.format(Locale.ROOT,
                                     "Error while adding extraction metadata triples document with IRI %s", documentIRI
                             ),
                             e
                     );
                 }
             }
         } finally {
             // NOTE(review): an exception thrown from this finally block replaces any
             // exception already propagating from the try block above.
             try {
                 output.endDocument(documentIRI);
             } catch (TripleHandlerException e) {
                 log.error(String.format(Locale.ROOT, "Error ending document with IRI %s", documentIRI));
                 throw new ExtractionException(String.format(Locale.ROOT, "Error ending document with IRI %s", documentIRI),
                         e
                 );
             }
         }

         return new SingleDocumentExtractionReport(
                 documentReport == null
                         ?
                 EmptyValidationReport.getInstance() : documentReport.getReport(),
                 extractorToIssues
         );
     }

     /**
      * Tells whether a MIME type is too unspecific to identify a concrete format.
      * A type is considered too generic when it is <code>null</code>, when it matches
      * any subtype, or when its full type is plain text, octet stream or XML.
      *
      * @param type the MIME type to be checked, may be <code>null</code>.
      * @return <code>true</code> if the type is too generic, <code>false</code> otherwise.
      */
     private static boolean isTooGeneric(MIMEType type) {
         if (type == null) {
             return true;
         }
         if (type.isAnySubtype()) {
             return true;
         }
         final String fullType = type.getFullType();
         if (fullType.equals(MimeTypes.PLAIN_TEXT)) {
             return true;
         }
         if (fullType.equals(MimeTypes.OCTET_STREAM)) {
             return true;
         }
         return fullType.equals(MimeTypes.XML);
     }

     /**
      * Triggers the execution of all the {@link Extractor}
      * registered to this class using the <i>default</i> extraction parameters
      * for the current configuration.
      *
      * @throws IOException if there is an error reading input from the document source.
      * @throws ExtractionException if there is an error during the extraction.
      * @return the extraction report.
      */
     public SingleDocumentExtractionReport run() throws IOException, ExtractionException {
         final ExtractionParameters defaultParameters = ExtractionParameters.newDefault(configuration);
         return run(defaultParameters);
     }

     /**
      * Returns the mimetype detected for the given {@link org.apache.any23.source.DocumentSource},
      * running the extractor filtering first if it has not been run yet.
      *
      * @return string containing the detected mimetype,
      *         or <code>null</code> if no mimetype has been detected.
      * @throws IOException if an error occurred while accessing the data.
      */
     public String getDetectedMIMEType() throws IOException {
         filterExtractorsByMIMEType();
         if (detectedMIMEType == null) {
             return null;
         }
         return detectedMIMEType.toString();
     }

     /**
      * Checks whether the content of the given {@link org.apache.any23.source.DocumentSource}
      * activates at least one extractor.
      *
      * @return <code>true</code> if at least one extractor is activated, <code>false</code> otherwise.
      * @throws IOException if there is an error locating matching extractors.
      */
     public boolean hasMatchingExtractors() throws IOException {
         filterExtractorsByMIMEType();
         final boolean noneActivated = matchingExtractors.isEmpty();
         return !noneActivated;
     }

     /**
      * Creates a new extractor instance from every factory currently activated
      * for the given {@link org.apache.any23.source.DocumentSource}.
      * The matching extractors are not computed by this method.
      *
      * @return the list of all the activated extractors.
      */
     @SuppressWarnings("rawtypes")
     public List<Extractor> getMatchingExtractors() {
         final List<Extractor> activated = new ArrayList<>();
         for (final ExtractorFactory<?> factory : matchingExtractors) {
             final Extractor created = factory.createExtractor();
             activated.add(created);
         }
         return activated;
     }

     /**
      * Returns the encoding used to parse the document. If no encoding has been
      * set or detected yet, it is detected from the document content and remembered.
      *
      * @return the configured parsing encoding.
      */
     public String getParserEncoding() {
         String encoding = this.parserEncoding;
         if (encoding == null) {
             encoding = detectEncoding();
             this.parserEncoding = encoding;
         }
         return encoding;
     }

     /**
      * Sets the document parser encoding and discards the cached document report,
      * so that the DOM is rebuilt the next time it is requested.
      *
      * @param encoding parser encoding.
      */
     public void setParserEncoding(String encoding) {
         // The cached DOM report is dropped along with the previous encoding.
         this.documentReport = null;
         this.parserEncoding = encoding;
     }

     /**
      * Checks whether the given {@link org.apache.any23.source.DocumentSource} is an <b>HTML</b> document,
      * i.e. whether at least one of the matching extractors accepts the <code>text/html</code> MIME type.
      *
      * @return <code>true</code> if the document source is an HTML document.
      * @throws IOException if an error occurs while accessing data.
      */
     private boolean isHTMLDocument() throws IOException {
         filterExtractorsByMIMEType();
         final MIMEType htmlType = MIMEType.parse("text/html");
         final ExtractorGroup htmlExtractors = matchingExtractors.filterByMIMEType(htmlType);
         return !htmlExtractors.isEmpty();
     }

     /**
      * Extracts the default language of the document where possible.
      * Only HTML documents are inspected; a failure while reading the DOM
      * is logged at debug level and reported as no language.
      *
      * @param extractionParameters extraction parameters to be applied to determine the document language.
      * @return the document language if any, <code>null</code> otherwise.
      * @throws java.io.IOException if an error occurs during the document analysis.
      * @throws org.apache.any23.validator.ValidatorException if an error occurs during validation.
      */
     private String extractDocumentLanguage(ExtractionParameters extractionParameters)
     throws IOException, ValidatorException {
         final boolean html = isHTMLDocument();
         if (!html) {
             return null;
         }
         final HTMLDocument htmlDocument;
         try {
             final DocumentReport report = getTagSoupDOM(extractionParameters);
             htmlDocument = new HTMLDocument(report.getDocument());
         } catch (IOException ioe) {
             log.debug("Cannot extract language from document.", ioe);
             return null;
         }
         return htmlDocument.getDefaultLanguage();
     }

     /**
      * Generates the list of extractors that can be applied to the given document.
      * The computation is performed once: further invocations return immediately.
      * When no detector is set, or when all the extractors support all content types,
      * every extractor is selected and no MIME type is detected.
      *
      * @throws IOException if an error occurs while accessing the document data.
      */
     private void filterExtractorsByMIMEType()
     throws IOException {
         if (matchingExtractors != null) {
             return;  // has already been run.
         }

         if (detector == null || extractors.allExtractorsSupportAllContentTypes()) {
             matchingExtractors = extractors;
             return;
         }
         ensureHasLocalCopy();
         // detect MIME based on the real file IRI rather than based on given base namespace
         final String documentPath = java.net.URI.create(in.getDocumentIRI()).getPath();
         // The stream is only needed while guessing the type: close it afterwards
         // instead of leaving it open.
         try (InputStream content = localDocumentSource.openInputStream()) {
             detectedMIMEType = detector.guessMIMEType(
                     documentPath,
                     content,
                     MIMEType.parse(localDocumentSource.getContentType())
             );
         }
         log.debug("detected media type: {}", detectedMIMEType);
         matchingExtractors = extractors.filterByMIMEType(detectedMIMEType);
     }

     /**
      * Triggers the execution of a specific {@link Extractor}.
      * The way the document is handed to the extractor depends on its kind:
      * blind extractors receive the document IRI, content extractors receive
      * the document input stream, TagSoup DOM extractors receive the parsed DOM.
      *
      * @param extractionParameters the parameters used for the extraction.
      * @param documentLanguage the language of the document, passed to the extraction context.
      * @param extractor the {@link Extractor} to be executed.
      * @throws ExtractionException if an error specific to an extractor happens.
      * @throws IOException if an IO error occurs during the extraction.
      * @return the report of the single extraction: issues, resource roots,
      *         property paths and whether the extraction result was touched.
      * @throws org.apache.any23.validator.ValidatorException if an error occurs during validation.
      * @throws IllegalStateException if the extractor kind is not supported.
      */
     private SingleExtractionReport runExtractor(
             final ExtractionParameters extractionParameters,
             final String documentLanguage,
             final Extractor<?> extractor
     ) throws ExtractionException, IOException, ValidatorException {
         if(log.isDebugEnabled()) {
             log.debug("Running {} on {}", extractor.getDescription().getExtractorName(), documentIRI);
         }
         long startTime = System.currentTimeMillis();
         final ExtractionContext extractionContext = new ExtractionContext(
                 extractor.getDescription().getExtractorName(),
                 documentIRI,
                 documentLanguage
         );
         final ExtractionResultImpl extractionResult = new ExtractionResultImpl(extractionContext, extractor, output);
         try {
             if (extractor instanceof BlindExtractor) {
                 final BlindExtractor blindExtractor = (BlindExtractor) extractor;
                 blindExtractor.run(extractionParameters, extractionContext, documentIRI, extractionResult);
             } else if (extractor instanceof ContentExtractor) {
                 ensureHasLocalCopy();
                 final ContentExtractor contentExtractor = (ContentExtractor) extractor;
                 // The stream is opened for this extractor only, so it is closed
                 // as soon as the extractor completes, successfully or not.
                 try (InputStream content = localDocumentSource.openInputStream()) {
                     contentExtractor.run(
                             extractionParameters,
                             extractionContext,
                             content,
                             extractionResult
                     );
                 }
             } else if (extractor instanceof TagSoupDOMExtractor) {
                 final TagSoupDOMExtractor tagSoupDOMExtractor = (TagSoupDOMExtractor) extractor;
                 final DocumentReport domReport = getTagSoupDOM(extractionParameters);
                 tagSoupDOMExtractor.run(
                         extractionParameters,
                         extractionContext,
                         domReport.getDocument(),
                         extractionResult
                 );
             } else {
                 throw new IllegalStateException("Extractor type not supported: " + extractor.getClass());
             }
             return
                 new SingleExtractionReport(
                     extractionResult.getIssues(),
                     new ArrayList<ResourceRoot>( extractionResult.getResourceRoots() ),
                     new ArrayList<PropertyPath>( extractionResult.getPropertyPaths() ),
                     extractionResult.wasTouched()
                 );
         } catch (ExtractionException ex) {
             if(log.isDebugEnabled()) {
                 log.debug(extractor.getDescription().getExtractorName() + ": " + ex.getMessage());
             }
             throw ex;
         } finally {
             try {
                 // Logging result error report.
                 if(log.isDebugEnabled() && extractionResult.hasIssues() ) {
                     ByteArrayOutputStream baos = new ByteArrayOutputStream();
                     extractionResult.printReport(new PrintStream(baos, true, "UTF-8"));
                     log.debug(baos.toString("UTF-8"));
                 }
             } finally {
                 // The result must be closed even if printing the debug report fails.
                 extractionResult.close();
             }

             long elapsed = System.currentTimeMillis() - startTime;
             if(log.isDebugEnabled()) {
                 log.debug("Completed " + extractor.getDescription().getExtractorName() + ", " + elapsed + "ms");
             }
         }
     }

     /**
      * Makes sure that a locally readable document source is available.
      * A local input is used as is; otherwise a local copy is created through
      * the configured copy factory, defaulting to a {@link MemCopyFactory}.
      *
      * @throws IOException if an error occurs while creating the local copy.
      */
     private void ensureHasLocalCopy() throws IOException {
         if (localDocumentSource == null) {
             if (in.isLocal()) {
                 localDocumentSource = in;
             } else {
                 if (copyFactory == null) {
                     copyFactory = new MemCopyFactory();
                 }
                 localDocumentSource = copyFactory.createLocalCopy(in);
             }
         }
     }

     /**
      * Returns the DOM of the given document source (that must be an HTML stream)
      * and the report of eventual fixes applied on it. The report is cached and
      * rebuilt only when it is missing or the extraction parameters differ from
      * the ones it was produced with.
      *
      * @param extractionParameters parameters to be used during extraction.
      * @return document report.
      * @throws IOException if an error occurs during data access.
      * @throws ValidatorException if an error occurs during validation.
      */
     private DocumentReport getTagSoupDOM(ExtractionParameters extractionParameters)
     throws IOException, ValidatorException {
         if (documentReport == null || !extractionParameters.equals(tagSoupDOMRelatedParameters)) {
             ensureHasLocalCopy();
             final InputStream is = new BufferedInputStream( localDocumentSource.openInputStream() );
             final String candidateEncoding;
             try {
                 // NOTE(review): getParserEncoding() does not receive this stream, so the
                 // mark/reset pair looks redundant; it is kept because TagSoupParser is not
                 // visible here - confirm the parser does not rely on the mark before removing it.
                 is.mark(Integer.MAX_VALUE);
                 candidateEncoding = getParserEncoding();
                 is.reset();
             } catch (IOException | RuntimeException e) {
                 // Encoding detection failed: nobody will consume the stream, release it.
                 try {
                     is.close();
                 } catch (IOException closeFailure) {
                     e.addSuppressed(closeFailure);
                 }
                 throw e;
             }
             final TagSoupParser tagSoupParser = new TagSoupParser(
                     is,
                     documentIRI.stringValue(),
                     candidateEncoding
             );
             if(extractionParameters.isValidate()) {
                 documentReport = tagSoupParser.getValidatedDOM( extractionParameters.isFix() );
             } else {
                 documentReport = new DocumentReport( EmptyValidationReport.getInstance(), tagSoupParser.getDOM() );
             }
             tagSoupDOMRelatedParameters = extractionParameters;
         }
         return documentReport;
     }

     /**
      * Detects the encoding of the local document source input stream.
      * The stream opened for the detection is always closed, also when
      * the detection fails.
      *
      * @return the encoding guessed by the encoding detector.
      * @throws RuntimeException if any error occurs while accessing the data or guessing the encoding.
      */
     private String detectEncoding() {
         try {
             ensureHasLocalCopy();
             try (InputStream is = new BufferedInputStream(localDocumentSource.openInputStream())) {
                 return this.encoderDetector.guessEncoding(is, localDocumentSource.getContentType());
             }
         } catch (Exception e) {
             throw new RuntimeException("An error occurred while trying to detect the input encoding.", e);
         }
     }

     /**
      * Checks whether <i>candidateSub</i> is a prefix of <i>list</i>, i.e. whether
      * every element of <i>candidateSub</i> equals the element of <i>list</i>
      * at the same position. An empty <i>candidateSub</i> is a prefix of any list.
      *
      * @param list a list of strings.
      * @param candidateSub a list of strings.
      * @return <code>true</code> if <i>candidateSub</i> is a sub path of <i>list</i>,
      *         <code>false</code> otherwise.
      */
     private boolean subPath(String[] list, String[] candidateSub) {
         final int prefixLength = candidateSub.length;
         if (prefixLength > list.length) {
             // A longer path can never be a prefix of a shorter one.
             return false;
         }
         int index = 0;
         while (index < prefixLength) {
             final boolean sameSegment = candidateSub[index].equals(list[index]);
             if (!sameSegment) {
                 return false;
             }
             index++;
         }
         return true;
     }

     /**
      * Emits, for every resource root node, a triple stating the host of the
      * document IRI as page domain. No triple is emitted when the document IRI
      * has no host. The given context is closed when the method completes,
      * whether it succeeds or fails.
      *
      * @param resourceRoots list of resource roots.
      * @param context extraction context to produce triples.
      * @throws ExtractionException if an error occurs while writing a triple or closing the context.
      * @throws IllegalArgumentException if the document IRI is not a valid URI.
      */
     private void addDomainTriplesPerResourceRoots(List<ResourceRoot> resourceRoots, ExtractionContext context)
     throws ExtractionException {
         try {
             // Add source Web domains to every resource root.
             final String host;
             try {
                 final java.net.URI sourceURI = new java.net.URI(in.getDocumentIRI());
                 host = sourceURI.getHost();
             } catch (URISyntaxException urise) {
                 throw new IllegalArgumentException(
                         "An error occurred while extracting the host from the document IRI.",
                         urise
                 );
             }
             if (host == null) {
                 // Nothing to emit; the finally block below still closes the context.
                 return;
             }
             for (final ResourceRoot root : resourceRoots) {
                 output.receiveTriple(
                         root.getRoot(),
                         vSINDICE.getProperty(SINDICE.DOMAIN),
                         SimpleValueFactory.getInstance().createLiteral(host),
                         null,
                         context
                 );
             }
         } catch (TripleHandlerException e) {
             throw new ExtractionException("Error while writing triple triple.", e);
         } finally {
             try {
                 output.closeContext(context);
             } catch (TripleHandlerException e) {
                 throw new ExtractionException("Error while closing context.", e);
             }
         }
     }

     /**
      * Creates the extraction context under which the consolidation triples are
      * produced. Every invocation generates a context with a new random unique ID.
      *
      * @param defaultLanguage the default language associated with the context.
      * @return an extraction context specific for consolidation triples.
      */
     private ExtractionContext createExtractionContext(String defaultLanguage) {
         final String uniqueID = UUID.randomUUID().toString();
         return new ExtractionContext("consolidation-extractor", documentIRI, defaultLanguage, uniqueID);
     }

     /**
      * Detects the nesting relationships among different Microformats and
      * makes them explicit by adding connection triples. Every resource root is
      * compared with every property path: a relationship is created when the two
      * come from different extractors, the property path extractor does not include
      * the resource root extractor, and the property path is a prefix of the
      * resource root path.
      *
      * @param resourceRoots list of extracted resource roots.
      * @param propertyPaths list of extracted property paths.
      * @param context extraction context used to produce the connection triples.
      * @throws TripleHandlerException if an error occurs while writing a triple.
      */
     private void addNestingRelationship(
             List<ResourceRoot> resourceRoots,
             List<PropertyPath> propertyPaths,
             ExtractionContext context
     ) throws TripleHandlerException {
         for (final ResourceRoot root : resourceRoots) {
             for (final PropertyPath path : propertyPaths) {
                 final Class<? extends MicroformatExtractor> rootExtractor = root.getExtractor();
                 final Class<? extends MicroformatExtractor> pathExtractor = path.getExtractor();
                 // Avoid wrong nesting relationships.
                 final boolean sameExtractor = rootExtractor.equals(pathExtractor);
                 // Avoid self declaring relationships
                 if (sameExtractor || MicroformatExtractor.includes(pathExtractor, rootExtractor)) {
                     continue;
                 }
                 final boolean nested = subPath(root.getPath(), path.getPath());
                 if (nested) {
                     createNestingRelationship(path, root, output, context);
                 }
             }
         }
     }

     /**
      * This method consolidates the graphs extracted from the same document.
      * In particular it adds:
      * <ul>
      *   <li>for every microformat root node a triple indicating the original Web page domain;</li>
      *   <li>triples indicating the nesting relationship among a microformat root and property paths of
      *       other nested microformats.
      *   </li>
      * </ul>
      * The triples are emitted inside a dedicated extraction context which is
      * opened before and closed after the consolidation, even when writing fails.
      *
      * @param resourceRoots list of RDF nodes representing roots of
      *        extracted microformat graphs and the corresponding HTML paths.
      * @param propertyPaths list of RDF nodes representing property subjects, property IRIs and the HTML paths
      *        from which such properties have been extracted.
      * @param addDomainTriples if <code>true</code> the domain triples are added for the given resource roots.
      * @param output the triple handler on which the consolidation context is opened and closed.
      *        NOTE(review): the nesting helper writes through the instance level handler
      *        rather than through this parameter; confirm both refer to the same handler.
      * @param defaultLanguage the default language handed to the newly created extraction context.
      * @return the extraction context created for the consolidation triples.
      * @throws ExtractionException if the context cannot be opened or closed,
      *         or if a triple cannot be written.
      */
     private ExtractionContext consolidateResources(
             List<ResourceRoot> resourceRoots,
             List<PropertyPath> propertyPaths,
             boolean addDomainTriples,
             TripleHandler output,
             String defaultLanguage
     ) throws ExtractionException {
         final ExtractionContext context = createExtractionContext(defaultLanguage);

         try {
             output.openContext(context);
         } catch (TripleHandlerException e) {
             throw new ExtractionException(
                     String.format(Locale.ROOT, "Error starting document with IRI %s", documentIRI),
                     e
             );
         }

         // Remembers the failure raised while writing, so that a subsequent failure
         // of closeContext() is attached to it instead of silently replacing it.
         Throwable writeFailure = null;
         try {
             if(addDomainTriples) {
                 addDomainTriplesPerResourceRoots(resourceRoots, context);
             }
             addNestingRelationship(resourceRoots, propertyPaths, context);
         } catch (TripleHandlerException the) {
             final ExtractionException wrapped = new ExtractionException("Error while writing triple.", the);
             writeFailure = wrapped;
             throw wrapped;
         } catch (Throwable t) {
             // Precise rethrow: only records the failure, its type is left untouched.
             writeFailure = t;
             throw t;
         } finally {
             try {
                 output.closeContext(context);
             } catch (TripleHandlerException e) {
                 final ExtractionException closeFailure =
                         new ExtractionException("Error while closing context.", e);
                 if (writeFailure == null) {
                     throw closeFailure;
                 }
                 // Keep the original failure as the primary one.
                 writeFailure.addSuppressed(closeFailure);
             }
         }

         return context;
     }

     /**
      * This method consolidates the graphs extracted from the same document.
      * In particular it adds:
      * <ul>
      *   <li>for every microformat root node a triple indicating the original Web page domain;</li>
      * </ul>
      * The triples are emitted inside a dedicated extraction context which is
      * opened before and closed after the consolidation, even when writing fails.
      *
      * @param resourceRoots list of RDF nodes representing roots of
      *        extracted microformat graphs and the corresponding HTML paths.
      * @param addDomainTriples if <code>true</code> the domain triples are added for the given resource roots.
      * @param output the triple handler on which the consolidation context is opened and closed.
      * @param defaultLanguage the default language handed to the newly created extraction context.
      * @return the extraction context created for the consolidation triples.
      * @throws ExtractionException if the context cannot be opened or closed,
      *         or if adding the domain triples fails.
      */
     private ExtractionContext consolidateResources(
             List<ResourceRoot> resourceRoots,
             boolean addDomainTriples,
             TripleHandler output,
             String defaultLanguage
     ) throws ExtractionException {
         final ExtractionContext context = createExtractionContext(defaultLanguage);

         try {
             output.openContext(context);
         } catch (TripleHandlerException e) {
             throw new ExtractionException(
                     String.format(Locale.ROOT, "Error starting document with IRI %s", documentIRI),
                     e
             );
         }

         // Remembers the failure raised while writing, so that a subsequent failure
         // of closeContext() is attached to it instead of silently replacing it.
         Throwable writeFailure = null;
         try {
             if(addDomainTriples) {
                 addDomainTriplesPerResourceRoots(resourceRoots, context);
             }
         } catch (Throwable t) {
             // Precise rethrow: only records the failure, its type is left untouched.
             writeFailure = t;
             throw t;
         } finally {
             try {
                 output.closeContext(context);
             } catch (TripleHandlerException the) {
                 final ExtractionException closeFailure =
                         new ExtractionException("Error while closing context.", the);
                 if (writeFailure == null) {
                     throw closeFailure;
                 }
                 // Keep the original failure as the primary one.
                 writeFailure.addSuppressed(closeFailure);
             }
         }

         return context;
     }

     /**
      * Emits two metadata triples about the document IRI: one carrying the
      * current date/time as an XSD dateTime literal and one carrying the
      * triple count reported by a {@link CountingTripleHandler} child, plus one.
      *
      * @param context the extraction context the metadata triples are sent with.
      * @throws TripleHandlerException if the output handler fails to receive a triple.
      */
     private void addExtractionTimeSizeMetaTriples(ExtractionContext context)
     throws TripleHandlerException {
         final SimpleValueFactory valueFactory = SimpleValueFactory.getInstance();

         // Extraction date triple.
         final String extractionTimestamp = RDFUtils.toXSDDateTime(new Date());
         output.receiveTriple(
                 valueFactory.createIRI(documentIRI.toString()),
                 vSINDICE.getProperty(SINDICE.DATE),
                 valueFactory.createLiteral(extractionTimestamp),
                 null,
                 context
         );

         // Triple count: read only after the date triple has been sent. When several
         // counting handlers are registered, the last one visited provides the value.
         int tripleCount = 0;
         final CompositeTripleHandler composite = (CompositeTripleHandler) output;
         for (TripleHandler child : composite.getChilds()) {
             if (child instanceof CountingTripleHandler) {
                 final CountingTripleHandler counter = (CountingTripleHandler) child;
                 tripleCount = counter.getCount();
             }
         }

         // The size triple counts itself as well, hence the increment.
         final int sizeIncludingSelf = tripleCount + 1;
         output.receiveTriple(
                 valueFactory.createIRI(documentIRI.toString()),
                 vSINDICE.getProperty(SINDICE.SIZE),
                 valueFactory.createLiteral(sizeIncludingSelf),
                 null,
                 context
         );
     }

     /**
      * Writes the three triples describing a nesting relationship: a blank
      * node linked to the original property, the same blank node linked to
      * the nested structure, and the property subject linked to the blank node.
      *
      * <p>The blank node is derived from the property string value followed
      * by the ID of the property object, when such object is present.</p>
      *
      * @param from the property containing the nested microformat.
      * @param to the root of the nested microformat; used as nested structure
      *        only when <code>from</code> has no object.
      * @param th the triple handler receiving the triples.
      * @param ec the extraction context used to add such information.
      * @throws org.apache.any23.writer.TripleHandlerException if the handler fails to receive a triple.
      */
     private void createNestingRelationship(
             PropertyPath from,
             ResourceRoot to,
             TripleHandler th,
             ExtractionContext ec
     ) throws TripleHandlerException {
         final BNode nestedObject = from.getObject();
         final String objectSuffix = nestedObject == null ? "" : nestedObject.getID();
         final BNode nestingNode = RDFUtils.getBNode(from.getProperty().stringValue() + objectSuffix);

         // nestingNode --original--> property
         th.receiveTriple(
                 nestingNode,
                 vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL),
                 from.getProperty(),
                 null,
                 ec
         );
         // nestingNode --structured--> property object, or the nested root if there is none
         th.receiveTriple(
                 nestingNode,
                 vSINDICE.getProperty(SINDICE.NESTING_STRUCTURED),
                 nestedObject == null ? to.getRoot() : nestedObject,
                 null,
                 ec
         );
         // property subject --nesting--> nestingNode
         th.receiveTriple(
                 from.getSubject(),
                 vSINDICE.getProperty(SINDICE.NESTING),
                 nestingNode,
                 null,
                 ec
         );
     }

     /**
      * Entity detection report: a plain holder whose fields are assigned once
      * by the constructor and never modified afterwards.
      */
     private static class SingleExtractionReport {

         /** Issues collected for the extraction this report refers to. */
         private final Collection<IssueReport.Issue> issues;

         /** Resource roots associated with this report. */
         private final List<ResourceRoot> resourceRoots;

         /** Property paths associated with this report. */
         private final List<PropertyPath> propertyPaths;

         /** Flag supplied at construction time; presumably tells whether output was produced (to be confirmed). */
         private final boolean touched;

         /**
          * Stores the given values as they are; the collections are kept by reference, not copied.
          *
          * @param issues the collected issues.
          * @param resourceRoots the detected resource roots.
          * @param propertyPaths the detected property paths.
          * @param wasTouched value for the <code>touched</code> flag.
          */
         public SingleExtractionReport(
                 Collection<IssueReport.Issue> issues,
                 List<ResourceRoot> resourceRoots,
                 List<PropertyPath> propertyPaths,
                 boolean wasTouched
         ) {
             this.touched = wasTouched;
             this.propertyPaths = propertyPaths;
             this.resourceRoots = resourceRoots;
             this.issues = issues;
         }
     }

 }