| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.any23.extractor; |
| |
| import org.apache.any23.configuration.Configuration; |
| import org.apache.any23.configuration.DefaultConfiguration; |
| import org.apache.any23.encoding.EncodingDetector; |
| import org.apache.any23.encoding.TikaEncodingDetector; |
| import org.apache.any23.extractor.html.DocumentReport; |
| import org.apache.any23.extractor.html.HTMLDocument; |
| import org.apache.any23.extractor.html.MicroformatExtractor; |
| import org.apache.any23.extractor.html.TagSoupParser; |
| import org.apache.any23.mime.MIMEType; |
| import org.apache.any23.mime.MIMETypeDetector; |
| import org.apache.any23.rdf.Any23ValueFactoryWrapper; |
| import org.apache.any23.rdf.RDFUtils; |
| import org.apache.any23.source.DocumentSource; |
| import org.apache.any23.source.LocalCopyFactory; |
| import org.apache.any23.source.MemCopyFactory; |
| import org.apache.any23.validator.EmptyValidationReport; |
| import org.apache.any23.validator.ValidatorException; |
| import org.apache.any23.vocab.SINDICE; |
| import org.apache.any23.writer.CompositeTripleHandler; |
| import org.apache.any23.writer.CountingTripleHandler; |
| import org.apache.any23.writer.TripleHandler; |
| import org.apache.any23.writer.TripleHandlerException; |
| import org.apache.any23.extractor.Extractor.BlindExtractor; |
| import org.apache.any23.extractor.Extractor.ContentExtractor; |
| import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor; |
| import org.apache.tika.mime.MimeTypes; |
| import org.eclipse.rdf4j.model.BNode; |
| import org.eclipse.rdf4j.model.IRI; |
| import org.eclipse.rdf4j.model.impl.SimpleValueFactory; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| import java.io.BufferedInputStream; |
| import java.io.ByteArrayOutputStream; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.PrintStream; |
| import java.net.URISyntaxException; |
| import java.nio.charset.StandardCharsets; |
| import java.util.ArrayList; |
| import java.util.Collection; |
| import java.util.Collections; |
| import java.util.Date; |
| import java.util.HashMap; |
| import java.util.List; |
| import java.util.Locale; |
| import java.util.Map; |
| import java.util.UUID; |
| import java.util.stream.Collectors; |
| |
| import static org.apache.any23.extractor.TagSoupExtractionResult.PropertyPath; |
| import static org.apache.any23.extractor.TagSoupExtractionResult.ResourceRoot; |
| |
| /** |
| * This class acts as a facade through which all the extractors are run on a single document. |
| */ |
| public class SingleDocumentExtraction { |
| |
| private static final SINDICE vSINDICE = SINDICE.getInstance(); |
| |
| private static final Logger log = LoggerFactory.getLogger(SingleDocumentExtraction.class); |
| |
| private final Configuration configuration; |
| |
| private final DocumentSource in; |
| |
| private IRI documentIRI; |
| |
| private final ExtractorGroup extractors; |
| |
| private final TripleHandler output; |
| |
| private final EncodingDetector encoderDetector; |
| |
| private LocalCopyFactory copyFactory = null; |
| |
| private DocumentSource localDocumentSource = null; |
| |
| private MIMETypeDetector detector = null; |
| |
| private ExtractorGroup matchingExtractors = null; |
| |
| private MIMEType detectedMIMEType = null; |
| |
| private DocumentReport documentReport = null; |
| |
| private ExtractionParameters tagSoupDOMRelatedParameters = null; |
| |
| private String parserEncoding = null; |
| |
| /** |
| * Builds an extractor by the specification of document source, |
| * list of extractors and output triple handler. |
| * |
| * @param configuration configuration applied during extraction. |
| * @param in input document source. |
| * @param extractors list of extractors to be applied. |
| * @param output output triple handler. |
| */ |
| public SingleDocumentExtraction( |
| Configuration configuration, DocumentSource in, ExtractorGroup extractors, TripleHandler output |
| ) { |
| if(configuration == null) |
| throw new NullPointerException("configuration cannot be null."); |
| if(in == null) |
| throw new NullPointerException("in cannot be null."); |
| this.configuration = configuration; |
| this.in = in; |
| this.extractors = extractors; |
| |
| List<TripleHandler> tripleHandlers = new ArrayList<>(); |
| tripleHandlers.add(output); |
| tripleHandlers.add(new CountingTripleHandler()); |
| this.output = new CompositeTripleHandler(tripleHandlers); |
| this.encoderDetector = new TikaEncodingDetector(); |
| } |
| |
/**
 * Creates an extraction that applies a single extractor, produced by the
 * given factory, to the given document source. The MIME type detector is
 * explicitly set to <code>null</code>, so no MIME type based filtering of
 * the extractor takes place.
 *
 * @param configuration configuration applied during extraction.
 * @param in input document source.
 * @param factory the extractors factory.
 * @param output output triple handler.
 */
public SingleDocumentExtraction(
        Configuration configuration, DocumentSource in, ExtractorFactory<?> factory, TripleHandler output
) {
    this(configuration, in, new ExtractorGroup(Collections.<ExtractorFactory<?>>singletonList(factory)), output);
    this.setMIMETypeDetector(null);
}
| |
/**
 * Creates an extraction that applies a single extractor, produced by the
 * given factory, to the given document source, using the
 * {@link org.apache.any23.configuration.DefaultConfiguration} singleton.
 * As for the other factory based constructor, the MIME type detector is
 * set to <code>null</code>.
 *
 * @param in input document source.
 * @param factory the extractors factory.
 * @param output output triple handler.
 */
public SingleDocumentExtraction(
        DocumentSource in, ExtractorFactory<?> factory, TripleHandler output
) {
    // Same wiring as the (configuration, in, factory, output) constructor,
    // which wraps the factory in a group and resets the MIME type detector.
    this(DefaultConfiguration.singleton(), in, factory, output);
}
| |
| /** |
| * Sets the internal factory for generating the document local copy, |
| * if <code>null</code> the {@link org.apache.any23.source.MemCopyFactory} will be used. |
| * |
| * @param copyFactory local copy factory. |
| * @see org.apache.any23.source.DocumentSource |
| */ |
| public void setLocalCopyFactory(LocalCopyFactory copyFactory) { |
| this.copyFactory = copyFactory; |
| } |
| |
| /** |
| * Sets the internal mime type detector, |
| * if <code>null</code> mimetype detection will |
| * be skipped and all extractors will be activated. |
| * |
| * @param detector detector instance. |
| */ |
| public void setMIMETypeDetector(MIMETypeDetector detector) { |
| this.detector = detector; |
| } |
| |
| /** |
| * Triggers the execution of all the {@link Extractor} |
| * registered to this class using the specified extraction parameters. |
| * |
| * @param extractionParameters the parameters applied to the run execution. |
| * @return the report generated by the extraction. |
| * @throws ExtractionException if an error occurred during the data extraction. |
| * @throws IOException if an error occurred during the data access. |
| */ |
| public SingleDocumentExtractionReport run(ExtractionParameters extractionParameters) |
| throws ExtractionException, IOException { |
| if(extractionParameters == null) { |
| extractionParameters = ExtractionParameters.newDefault(configuration); |
| } |
| |
| final String contextIRI = extractionParameters.getProperty(ExtractionParameters.EXTRACTION_CONTEXT_IRI_PROPERTY); |
| ensureHasLocalCopy(); |
| try { |
| this.documentIRI = new Any23ValueFactoryWrapper( |
| SimpleValueFactory.getInstance() |
| ).createIRI( "?".equals(contextIRI) ? in.getDocumentIRI() : contextIRI); |
| } catch (Exception ex) { |
| throw new IllegalArgumentException("Invalid IRI: " + in.getDocumentIRI(), ex); |
| } |
| if (log.isDebugEnabled()) { |
| log.debug("Processing " + this.documentIRI); |
| } |
| filterExtractorsByMIMEType(); |
| |
| if(log.isDebugEnabled()) { |
| StringBuilder sb = new StringBuilder("Extractors "); |
| for (ExtractorFactory<?> factory : matchingExtractors) { |
| sb.append(factory.getExtractorName()); |
| sb.append(' '); |
| } |
| sb.append("match ").append(documentIRI); |
| log.debug(sb.toString()); |
| } |
| |
| final List<ResourceRoot> resourceRoots = new ArrayList<>(); |
| final List<PropertyPath> propertyPaths = new ArrayList<>(); |
| final Map<String,Collection<IssueReport.Issue>> extractorToIssues = |
| new HashMap<>(); |
| |
| // Invoke all extractors. |
| try { |
| output.startDocument(documentIRI); |
| } catch (TripleHandlerException e) { |
| log.error(String.format(Locale.ROOT, "Error starting document with IRI %s", documentIRI)); |
| throw new ExtractionException(String.format(Locale.ROOT, "Error starting document with IRI %s", documentIRI), |
| e |
| ); |
| } |
| try { |
| output.setContentLength(in.getContentLength()); |
| // Create the document context. |
| final String documentLanguage; |
| try { |
| documentLanguage = extractDocumentLanguage(extractionParameters); |
| ArrayList<ExtractorFactory<?>> filteredList = new ArrayList<>(matchingExtractors.getNumOfExtractors()); |
| final boolean mimeTypeIsTooGeneric = isTooGeneric(detectedMIMEType); |
| ArrayList<String> intersectionOfRdfMimetypes = null; |
| for (ExtractorFactory<?> factory : matchingExtractors) { |
| final Extractor<?> extractor = factory.createExtractor(); |
| final SingleExtractionReport er = runExtractor( |
| extractionParameters, |
| documentLanguage, |
| extractor |
| ); |
| // Fix for ANY23-415: |
| if (mimeTypeIsTooGeneric) { |
| List<String> rdfMimetypes = factory.getSupportedMIMETypes().stream() |
| .filter(mt -> !isTooGeneric(mt)) |
| .map(MIMEType::getFullType) |
| .collect(Collectors.toList()); |
| if (er.touched) { |
| // If detected mimetype is too generic, but we find extractors matching |
| // this mimetype that are capable of producing RDF triples from this resource, |
| // and these extractors are also associated with more specific RDF mimetypes, |
| // then we can simply take the intersection of these more specific mimetypes |
| // to narrow down the generic, non-RDF mimetype to a specific RDF mimetype. |
| if (intersectionOfRdfMimetypes == null) { |
| intersectionOfRdfMimetypes = new ArrayList<>(rdfMimetypes); |
| } else { |
| intersectionOfRdfMimetypes.retainAll(rdfMimetypes); |
| } |
| } else if (!rdfMimetypes.isEmpty()) { |
| // If detected mimetype is too generic, and this extractor matches both the |
| // generic mimetype and a more specific mimetype, but did not produce any RDF |
| // triples, then we can safely assume that this extractor is not actually a |
| // match for the type of file we are parsing (e.g., a "humans.txt" file). |
| continue; |
| } |
| } |
| resourceRoots.addAll( er.resourceRoots ); |
| propertyPaths.addAll( er.propertyPaths ); |
| filteredList.add(factory); |
| extractorToIssues.put(factory.getExtractorName(), er.issues); |
| } |
| matchingExtractors = new ExtractorGroup(filteredList); |
| if (intersectionOfRdfMimetypes != null && !intersectionOfRdfMimetypes.isEmpty()) { |
| // If the detected mimetype is a generic, non-RDF mimetype, and the intersection |
| // of specific RDF mimetypes across all triple-producing extractors is non-empty, |
| // simply replace the generic mimetype with a specific RDF mimetype in that intersection. |
| detectedMIMEType = MIMEType.parse(intersectionOfRdfMimetypes.get(0)); |
| } |
| } catch(ValidatorException ve) { |
| throw new ExtractionException("An error occurred during the validation phase.", ve); |
| } |
| |
| // Resource consolidation. |
| final boolean addDomainTriples = extractionParameters.getFlag(ExtractionParameters.METADATA_DOMAIN_PER_ENTITY_FLAG); |
| final ExtractionContext consolidationContext; |
| if(extractionParameters.getFlag(ExtractionParameters.METADATA_NESTING_FLAG)) { |
| // Consolidation with nesting. |
| consolidationContext = consolidateResources(resourceRoots, propertyPaths, addDomainTriples, output, documentLanguage); |
| } else { |
| consolidationContext = consolidateResources(resourceRoots, addDomainTriples, output, documentLanguage); |
| } |
| |
| // Adding time/size meta triples. |
| if (extractionParameters.getFlag(ExtractionParameters.METADATA_TIMESIZE_FLAG)) { |
| try { |
| addExtractionTimeSizeMetaTriples(consolidationContext); |
| } catch (TripleHandlerException e) { |
| throw new ExtractionException( |
| String.format(Locale.ROOT, |
| "Error while adding extraction metadata triples document with IRI %s", documentIRI |
| ), |
| e |
| ); |
| } |
| } |
| } finally { |
| try { |
| output.endDocument(documentIRI); |
| } catch (TripleHandlerException e) { |
| log.error(String.format(Locale.ROOT, "Error ending document with IRI %s", documentIRI)); |
| throw new ExtractionException(String.format(Locale.ROOT, "Error ending document with IRI %s", documentIRI), |
| e |
| ); |
| } |
| } |
| |
| return new SingleDocumentExtractionReport( |
| documentReport == null |
| ? |
| EmptyValidationReport.getInstance() : documentReport.getReport(), |
| extractorToIssues |
| ); |
| } |
| |
| private static boolean isTooGeneric(MIMEType type) { |
| if (type == null || type.isAnySubtype()) { |
| return true; |
| } |
| String mt = type.getFullType(); |
| return mt.equals(MimeTypes.PLAIN_TEXT) |
| || mt.equals(MimeTypes.OCTET_STREAM) |
| || mt.equals(MimeTypes.XML); |
| } |
| |
| /** |
| * Triggers the execution of all the {@link Extractor} |
| * registered to this class using the <i>default</i> extraction parameters. |
| * |
| * @throws IOException if there is an error reading input from the document source |
| * @throws ExtractionException if there is an error duing distraction |
| * @return the extraction report. |
| */ |
| public SingleDocumentExtractionReport run() throws IOException, ExtractionException { |
| return run(ExtractionParameters.newDefault(configuration)); |
| } |
| |
| /** |
| * Returns the detected mimetype for the given {@link org.apache.any23.source.DocumentSource}. |
| * |
| * @return string containing the detected mimetype. |
| * @throws IOException if an error occurred while accessing the data. |
| */ |
| public String getDetectedMIMEType() throws IOException { |
| filterExtractorsByMIMEType(); |
| return detectedMIMEType == null ? null : detectedMIMEType.toString(); |
| } |
| |
| /** |
| * Check whether the given {@link org.apache.any23.source.DocumentSource} content activates of not at least an extractor. |
| * |
| * @return <code>true</code> if at least an extractor is activated, <code>false</code> otherwise. |
| * @throws IOException if there is an error locating matching extractors |
| */ |
| public boolean hasMatchingExtractors() throws IOException { |
| filterExtractorsByMIMEType(); |
| return !matchingExtractors.isEmpty(); |
| } |
| |
| /** |
| * @return the list of all the activated extractors for the given {@link org.apache.any23.source.DocumentSource}. |
| */ |
| @SuppressWarnings("rawtypes") |
| public List<Extractor> getMatchingExtractors() { |
| final List<Extractor> extractorsList = new ArrayList<>(); |
| for(ExtractorFactory extractorFactory : matchingExtractors) { |
| extractorsList.add( extractorFactory.createExtractor() ); |
| } |
| return extractorsList; |
| } |
| |
| /** |
| * @return the configured parsing encoding. |
| */ |
| public String getParserEncoding() { |
| if(this.parserEncoding == null) { |
| this.parserEncoding = detectEncoding(); |
| } |
| return this.parserEncoding; |
| } |
| |
| /** |
| * Sets the document parser encoding. |
| * |
| * @param encoding parser encoding. |
| */ |
| public void setParserEncoding(String encoding) { |
| this.parserEncoding = encoding; |
| documentReport = null; |
| } |
| |
| /** |
| * Chech whether the given {@link org.apache.any23.source.DocumentSource} is an <b>HTML</b> document. |
| * |
| * @return <code>true</code> if the document source is an HTML document. |
| * @throws IOException if an error occurs while accessing data. |
| */ |
| private boolean isHTMLDocument() throws IOException { |
| filterExtractorsByMIMEType(); |
| return ! matchingExtractors.filterByMIMEType( MIMEType.parse("text/html") ).isEmpty(); |
| } |
| |
| /** |
| * Extracts the document language where possible. |
| * |
| * @param extractionParameters extraction parameters to be applied to determine the document language. |
| * @return the document language if any, <code>null</code> otherwise. |
| * @throws java.io.IOException if an error occurs during the document analysis. |
| * @throws org.apache.any23.validator.ValidatorException |
| */ |
| private String extractDocumentLanguage(ExtractionParameters extractionParameters) |
| throws IOException, ValidatorException { |
| if( ! isHTMLDocument() ) { |
| return null; |
| } |
| final HTMLDocument document; |
| try { |
| document = new HTMLDocument( getTagSoupDOM(extractionParameters).getDocument() ); |
| } catch (IOException ioe) { |
| log.debug("Cannot extract language from document.", ioe); |
| return null; |
| } |
| return document.getDefaultLanguage(); |
| } |
| |
| /** |
| * Generates a list of extractors that can be applied to the given document. |
| * |
| * @throws IOException |
| */ |
| private void filterExtractorsByMIMEType() |
| throws IOException { |
| if (matchingExtractors != null) |
| return; // has already been run. |
| |
| if (detector == null || extractors.allExtractorsSupportAllContentTypes()) { |
| matchingExtractors = extractors; |
| return; |
| } |
| ensureHasLocalCopy(); |
| // detect MIME based on the real file IRI rather than based on given base namespace |
| detectedMIMEType = detector.guessMIMEType( |
| java.net.URI.create(in.getDocumentIRI()).getPath(), |
| localDocumentSource.openInputStream(), |
| MIMEType.parse(localDocumentSource.getContentType()) |
| ); |
| log.debug("detected media type: " + detectedMIMEType); |
| matchingExtractors = extractors.filterByMIMEType(detectedMIMEType); |
| } |
| |
| /** |
| * Triggers the execution of a specific {@link Extractor}. |
| * |
| * @param extractionParameters the parameters used for the extraction. |
| * @param extractor the {@link Extractor} to be executed. |
| * @throws ExtractionException if an error specific to an extractor happens. |
| * @throws IOException if an IO error occurs during the extraction. |
| * @return the roots of the resources that have been extracted. |
| * @throws org.apache.any23.validator.ValidatorException if an error occurs during validation. |
| */ |
| private SingleExtractionReport runExtractor( |
| final ExtractionParameters extractionParameters, |
| final String documentLanguage, |
| final Extractor<?> extractor |
| ) throws ExtractionException, IOException, ValidatorException { |
| if(log.isDebugEnabled()) { |
| log.debug("Running {} on {}", extractor.getDescription().getExtractorName(), documentIRI); |
| } |
| long startTime = System.currentTimeMillis(); |
| final ExtractionContext extractionContext = new ExtractionContext( |
| extractor.getDescription().getExtractorName(), |
| documentIRI, |
| documentLanguage |
| ); |
| final ExtractionResultImpl extractionResult = new ExtractionResultImpl(extractionContext, extractor, output); |
| try { |
| if (extractor instanceof BlindExtractor) { |
| final BlindExtractor blindExtractor = (BlindExtractor) extractor; |
| blindExtractor.run(extractionParameters, extractionContext, documentIRI, extractionResult); |
| } else if (extractor instanceof ContentExtractor) { |
| ensureHasLocalCopy(); |
| final ContentExtractor contentExtractor = (ContentExtractor) extractor; |
| contentExtractor.run( |
| extractionParameters, |
| extractionContext, |
| localDocumentSource.openInputStream(), |
| extractionResult |
| ); |
| } else if (extractor instanceof TagSoupDOMExtractor) { |
| final TagSoupDOMExtractor tagSoupDOMExtractor = (TagSoupDOMExtractor) extractor; |
| final DocumentReport documentReport = getTagSoupDOM(extractionParameters); |
| tagSoupDOMExtractor.run( |
| extractionParameters, |
| extractionContext, |
| documentReport.getDocument(), |
| extractionResult |
| ); |
| } else { |
| throw new IllegalStateException("Extractor type not supported: " + extractor.getClass()); |
| } |
| return |
| new SingleExtractionReport( |
| extractionResult.getIssues(), |
| new ArrayList<ResourceRoot>( extractionResult.getResourceRoots() ), |
| new ArrayList<PropertyPath>( extractionResult.getPropertyPaths() ), |
| extractionResult.wasTouched() |
| ); |
| } catch (ExtractionException ex) { |
| if(log.isDebugEnabled()) { |
| log.debug(extractor.getDescription().getExtractorName() + ": " + ex.getMessage()); |
| } |
| throw ex; |
| } finally { |
| // Logging result error report. |
| if(log.isDebugEnabled() && extractionResult.hasIssues() ) { |
| ByteArrayOutputStream baos = new ByteArrayOutputStream(); |
| extractionResult.printReport(new PrintStream(baos, true, "UTF-8")); |
| log.debug(baos.toString("UTF-8")); |
| } |
| extractionResult.close(); |
| |
| long elapsed = System.currentTimeMillis() - startTime; |
| if(log.isDebugEnabled()) { |
| log.debug("Completed " + extractor.getDescription().getExtractorName() + ", " + elapsed + "ms"); |
| } |
| } |
| } |
| |
| /** |
| * Forces the retrieval of the document data. |
| * |
| * @throws IOException |
| */ |
| private void ensureHasLocalCopy() throws IOException { |
| if (localDocumentSource != null) |
| return; |
| if (in.isLocal()) { |
| localDocumentSource = in; |
| return; |
| } |
| if (copyFactory == null) { |
| copyFactory = new MemCopyFactory(); |
| } |
| localDocumentSource = copyFactory.createLocalCopy(in); |
| } |
| |
| /** |
| * Returns the DOM of the given document source (that must be an HTML stream) |
| * and the report of eventual fixes applied on it. |
| * |
| * @param extractionParameters parameters to be used during extraction. |
| * @return document report. |
| * @throws IOException if an error occurs during data access. |
| * @throws ValidatorException if an error occurs during validation. |
| */ |
| private DocumentReport getTagSoupDOM(ExtractionParameters extractionParameters) |
| throws IOException, ValidatorException { |
| if (documentReport == null || !extractionParameters.equals(tagSoupDOMRelatedParameters)) { |
| ensureHasLocalCopy(); |
| final InputStream is = new BufferedInputStream( localDocumentSource.openInputStream() ); |
| is.mark(Integer.MAX_VALUE); |
| final String candidateEncoding = getParserEncoding(); |
| is.reset(); |
| final TagSoupParser tagSoupParser = new TagSoupParser( |
| is, |
| documentIRI.stringValue(), |
| candidateEncoding |
| ); |
| if(extractionParameters.isValidate()) { |
| documentReport = tagSoupParser.getValidatedDOM( extractionParameters.isFix() ); |
| } else { |
| documentReport = new DocumentReport( EmptyValidationReport.getInstance(), tagSoupParser.getDOM() ); |
| } |
| tagSoupDOMRelatedParameters = extractionParameters; |
| } |
| return documentReport; |
| } |
| |
| /** |
| * Detects the encoding of the local document source input stream. |
| * |
| * @return a valid encoding value. |
| */ |
| private String detectEncoding() { |
| try { |
| ensureHasLocalCopy(); |
| InputStream is = new BufferedInputStream(localDocumentSource.openInputStream()); |
| String encoding = this.encoderDetector.guessEncoding(is, localDocumentSource.getContentType()); |
| is.close(); |
| return encoding; |
| } catch (Exception e) { |
| throw new RuntimeException("An error occurred while trying to detect the input encoding.", e); |
| } |
| } |
| |
| /** |
| * This function verifies if the <i>candidateSub</i> list of strings |
| * is a prefix of <i>list</i>. |
| * |
| * @param list a list of strings. |
| * @param candidateSub a list of strings. |
| * @return <code>true</code> if <i>candidateSub</i> is a sub path of <i>list</i>, |
| * <code>false</code> otherwise. |
| */ |
| private boolean subPath(String[] list, String[] candidateSub) { |
| if(candidateSub.length > list.length) { |
| return false; |
| } |
| for(int i = 0; i < candidateSub.length; i++) { |
| if( ! candidateSub[i].equals(list[i])) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| /** |
| * Adds for every resource root node a page domain triple. |
| * |
| * @param resourceRoots list of resource roots. |
| * @param context extraction context to produce triples. |
| * @throws ExtractionException |
| */ |
| private void addDomainTriplesPerResourceRoots(List<ResourceRoot> resourceRoots, ExtractionContext context) |
| throws ExtractionException { |
| try { |
| // Add source Web domains to every resource root. |
| String domain; |
| try { |
| domain = new java.net.URI(in.getDocumentIRI()).getHost(); |
| } catch (URISyntaxException urise) { |
| throw new IllegalArgumentException( |
| "An error occurred while extracting the host from the document IRI.", |
| urise |
| ); |
| } |
| if (domain != null) { |
| for (ResourceRoot resourceRoot : resourceRoots) { |
| output.receiveTriple( |
| resourceRoot.getRoot(), |
| vSINDICE.getProperty(SINDICE.DOMAIN), |
| SimpleValueFactory.getInstance().createLiteral(domain), |
| null, |
| context |
| ); |
| } |
| } |
| } catch (TripleHandlerException e) { |
| throw new ExtractionException("Error while writing triple triple.", e); |
| } finally { |
| try { |
| output.closeContext(context); |
| } catch (TripleHandlerException e) { |
| throw new ExtractionException("Error while closing context.", e); |
| } |
| } |
| } |
| |
| /** |
| * @return an extraction context specific for consolidation triples. |
| */ |
| private ExtractionContext createExtractionContext(String defaultLanguage) { |
| return new ExtractionContext( |
| "consolidation-extractor", |
| documentIRI, |
| defaultLanguage, |
| UUID.randomUUID().toString() |
| ); |
| } |
| |
| /** |
| * Detect the nesting relationship among different |
| * Microformats and explicit them adding connection triples. |
| * |
| * @param resourceRoots |
| * @param propertyPaths |
| * @param context |
| * @throws TripleHandlerException |
| */ |
| private void addNestingRelationship( |
| List<ResourceRoot> resourceRoots, |
| List<PropertyPath> propertyPaths, |
| ExtractionContext context |
| ) throws TripleHandlerException { |
| ResourceRoot currentResourceRoot; |
| PropertyPath currentPropertyPath; |
| for (int r = 0; r < resourceRoots.size(); r++) { |
| currentResourceRoot = resourceRoots.get(r); |
| for (int p = 0; p < propertyPaths.size(); p++) { |
| currentPropertyPath = propertyPaths.get(p); |
| Class<? extends MicroformatExtractor> currentResourceRootExtractor = currentResourceRoot.getExtractor(); |
| Class<? extends MicroformatExtractor> currentPropertyPathExtractor = currentPropertyPath.getExtractor(); |
| // Avoid wrong nesting relationships. |
| if (currentResourceRootExtractor.equals(currentPropertyPathExtractor)) { |
| continue; |
| } |
| // Avoid self declaring relationships |
| if(MicroformatExtractor.includes(currentPropertyPathExtractor, currentResourceRootExtractor)) { |
| continue; |
| } |
| if (subPath(currentResourceRoot.getPath(), currentPropertyPath.getPath())) { |
| createNestingRelationship(currentPropertyPath, currentResourceRoot, output, context); |
| } |
| } |
| } |
| } |
| |
| /** |
| * This method consolidates the graphs extracted from the same document. |
| * In particular it adds: |
| * <ul> |
| * <li>for every microformat root node a triple indicating the original Web page domain;</li> |
| * <li>triples indicating the nesting relationship among a microformat root and property paths of |
| * other nested microformats. |
| * </li> |
| * </ul> |
| * @param resourceRoots list of RDF nodes representing roots of |
| * extracted microformat graphs and the corresponding HTML paths. |
| * @param propertyPaths list of RDF nodes representing property subjects, property IRIs and the HTML paths |
| * from which such properties have been extracted. |
| * @param addDomainTriples |
| * @param output a triple handler event collector. |
| * @return |
| * @throws ExtractionException |
| */ |
| private ExtractionContext consolidateResources( |
| List<ResourceRoot> resourceRoots, |
| List<PropertyPath> propertyPaths, |
| boolean addDomainTriples, |
| TripleHandler output, |
| String defaultLanguage |
| ) throws ExtractionException { |
| final ExtractionContext context = createExtractionContext(defaultLanguage); |
| |
| try { |
| output.openContext(context); |
| } catch (TripleHandlerException e) { |
| throw new ExtractionException( |
| String.format(Locale.ROOT, "Error starting document with IRI %s", documentIRI), |
| e |
| ); |
| } |
| |
| try { |
| if(addDomainTriples) { |
| addDomainTriplesPerResourceRoots(resourceRoots, context); |
| } |
| addNestingRelationship(resourceRoots, propertyPaths, context); |
| } catch (TripleHandlerException the) { |
| throw new ExtractionException("Error while writing triple triple.", the); |
| } finally { |
| try { |
| output.closeContext(context); |
| } catch (TripleHandlerException e) { |
| throw new ExtractionException("Error while closing context.", e); |
| } |
| } |
| |
| return context; |
| } |
| |
| /** |
| * This method consolidates the graphs extracted from the same document. |
| * In particular it adds: |
| * <ul> |
| * <li>for every microformat root node a triple indicating the original Web page domain;</li> |
| * </ul> |
| * @param resourceRoots list of RDF nodes representing roots of |
| * extracted microformat graphs and the corresponding HTML paths. |
| * from which such properties have been extracted. |
| * @param addDomainTriples |
| * @param output a triple handler event collector. |
| * @return |
| * @throws ExtractionException |
| */ |
| private ExtractionContext consolidateResources( |
| List<ResourceRoot> resourceRoots, |
| boolean addDomainTriples, |
| TripleHandler output, |
| String defaultLanguage |
| ) throws ExtractionException { |
| final ExtractionContext context = createExtractionContext(defaultLanguage); |
| |
| try { |
| output.openContext(context); |
| } catch (TripleHandlerException e) { |
| throw new ExtractionException( |
| String.format(Locale.ROOT, "Error starting document with IRI %s", documentIRI), |
| e |
| ); |
| } |
| |
| try { |
| if(addDomainTriples) { |
| addDomainTriplesPerResourceRoots(resourceRoots, context); |
| } |
| } finally { |
| try { |
| output.closeContext(context); |
| } catch (TripleHandlerException the) { |
| throw new ExtractionException("Error while closing context.", the); |
| } |
| } |
| |
| return context; |
| } |
| |
| /** |
| * Adds metadata triples containing the number of extracted triples |
| * and the extraction timestamp. |
| * |
| * @param context |
| * @throws TripleHandlerException |
| */ |
| private void addExtractionTimeSizeMetaTriples(ExtractionContext context) |
| throws TripleHandlerException { |
| // adding extraction date |
| String xsdDateTimeNow = RDFUtils.toXSDDateTime(new Date()); |
| output.receiveTriple( |
| SimpleValueFactory.getInstance().createIRI(documentIRI.toString()), |
| vSINDICE.getProperty(SINDICE.DATE), |
| SimpleValueFactory.getInstance().createLiteral(xsdDateTimeNow), |
| null, |
| context |
| ); |
| |
| // adding number of extracted triples |
| int numberOfTriples = 0; |
| CompositeTripleHandler cth = (CompositeTripleHandler) output; |
| for (TripleHandler th : cth.getChilds()) { |
| if (th instanceof CountingTripleHandler) { |
| numberOfTriples = ((CountingTripleHandler) th).getCount(); |
| } |
| } |
| output.receiveTriple( |
| SimpleValueFactory.getInstance().createIRI(documentIRI.toString()), |
| vSINDICE.getProperty(SINDICE.SIZE), |
| SimpleValueFactory.getInstance().createLiteral(numberOfTriples + 1), // the number of triples plus itself |
| null, |
| context |
| ); |
| } |
| |
| /** |
| * Creates a nesting relationship triple. |
| * |
| * @param from the property containing the nested microformat. |
| * @param to the root to the nested microformat. |
| * @param th the triple handler. |
| * @param ec the extraction context used to add such information. |
| * @throws org.apache.any23.writer.TripleHandlerException |
| */ |
| private void createNestingRelationship( |
| PropertyPath from, |
| ResourceRoot to, |
| TripleHandler th, |
| ExtractionContext ec |
| ) throws TripleHandlerException { |
| final BNode fromObject = from.getObject(); |
| final String bNodeHash = from.getProperty().stringValue() + ( fromObject == null ? "" : fromObject.getID() ); |
| BNode bnode = RDFUtils.getBNode(bNodeHash); |
| th.receiveTriple(bnode, vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), from.getProperty(), null, ec ); |
| th.receiveTriple( |
| bnode, |
| vSINDICE.getProperty(SINDICE.NESTING_STRUCTURED), |
| from.getObject() == null ? to.getRoot() : from.getObject(), |
| null, |
| ec |
| ); |
| th.receiveTriple( |
| from.getSubject(), |
| vSINDICE.getProperty(SINDICE.NESTING), |
| bnode, |
| null, |
| ec |
| ); |
| } |
| |
| /** |
| * Entity detection report. |
| */ |
| private static class SingleExtractionReport { |
| private final Collection<IssueReport.Issue> issues; |
| private final List<ResourceRoot> resourceRoots; |
| private final List<PropertyPath> propertyPaths; |
| private final boolean touched; |
| |
| public SingleExtractionReport( |
| Collection<IssueReport.Issue> issues, |
| List<ResourceRoot> resourceRoots, |
| List<PropertyPath> propertyPaths, |
| boolean wasTouched |
| ) { |
| this.issues = issues; |
| this.resourceRoots = resourceRoots; |
| this.propertyPaths = propertyPaths; |
| this.touched = wasTouched; |
| } |
| } |
| |
| } |