blob: dac6ad838566bc768220a651a3cbfe0867b09dff [file] [log] [blame]
/**
* Copyright 2008-2010 Digital Enterprise Research Institute (DERI)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.deri.any23.extractor;
import org.deri.any23.encoding.EncodingDetector;
import org.deri.any23.encoding.TikaEncodingDetector;
import org.deri.any23.extractor.Extractor.BlindExtractor;
import org.deri.any23.extractor.Extractor.ContentExtractor;
import org.deri.any23.extractor.Extractor.TagSoupDOMExtractor;
import org.deri.any23.extractor.html.TagSoupParser;
import org.deri.any23.mime.MIMEType;
import org.deri.any23.mime.MIMETypeDetector;
import org.deri.any23.rdf.Any23ValueFactoryWrapper;
import org.deri.any23.source.DocumentSource;
import org.deri.any23.source.LocalCopyFactory;
import org.deri.any23.source.MemCopyFactory;
import org.deri.any23.writer.TripleHandler;
import org.openrdf.model.URI;
import org.openrdf.model.impl.ValueFactoryImpl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
/**
* This class acts as a facade through which all the registered extractors are run on a single document.
*/
public class SingleDocumentExtraction {

    private static final Logger log = LoggerFactory.getLogger(SingleDocumentExtraction.class);

    /** Source of the document to process, as handed in by the caller. */
    private final DocumentSource in;

    /** URI of the document; resolved lazily from {@link #in} on first use. */
    private URI documentURI;

    /** All the extractors registered for this extraction. */
    private final ExtractorGroup extractors;

    /** Handler receiving the document start/end notifications and the extraction results. */
    private final TripleHandler output;

    /** Detector used to guess the document encoding when none has been set explicitly. */
    private final EncodingDetector encoderDetector;

    /** Factory used to obtain a local copy of a non local source; defaults to {@link MemCopyFactory}. */
    private LocalCopyFactory copyFactory = null;

    /** Local version of {@link #in}; <code>null</code> until first needed. */
    private DocumentSource localDocumentSource = null;

    /** Optional MIME type detector; when <code>null</code> no filtering by MIME type is performed. */
    private MIMETypeDetector detector = null;

    /** Extractors selected for the document; <code>null</code> until the selection has been performed. */
    private ExtractorGroup matchingExtractors = null;

    /** MIME type guessed by {@link #detector}; <code>null</code> if no detection has been performed. */
    private MIMEType detectedMIMEType = null;

    /** Cached DOM shared by all the {@link TagSoupDOMExtractor}s; <code>null</code> until first requested. */
    private Document tagSoupDOM = null;

    /** Encoding forced on the DOM parser; when <code>null</code> the encoding is guessed from the content. */
    private String parserEncoding = null;

    /**
     * Creates an extraction running a group of extractors over a document.
     *
     * @param in the source of the document to be processed.
     * @param extractors the extractors that are candidate to be applied to the document.
     * @param output the handler receiving the result of the extraction.
     */
    public SingleDocumentExtraction(DocumentSource in, ExtractorGroup extractors, TripleHandler output) {
        this.in = in;
        this.extractors = extractors;
        this.output = output;
        this.encoderDetector = new TikaEncodingDetector();
    }

    /**
     * Creates an extraction running a single extractor over a document. No MIME type
     * detector is set, so the extractor is applied without any filtering unless a
     * detector is configured afterwards through {@link #setMIMETypeDetector(MIMETypeDetector)}.
     *
     * @param in the source of the document to be processed.
     * @param factory the factory of the only extractor to be applied.
     * @param output the handler receiving the result of the extraction.
     */
    public SingleDocumentExtraction(DocumentSource in, ExtractorFactory<?> factory, TripleHandler output) {
        this(
                in,
                new ExtractorGroup(Collections.<ExtractorFactory<?>>singletonList(factory)),
                output
        );
        // Assigned directly rather than through the public setter, to avoid invoking an
        // overridable method from within a constructor.
        this.detector = null;
    }

    /**
     * Sets the factory used to create a local copy of the document when the source is not local.
     *
     * @param copyFactory the factory to be used; if <code>null</code> a {@link MemCopyFactory} is used.
     */
    public void setLocalCopyFactory(LocalCopyFactory copyFactory) {
        this.copyFactory = copyFactory;
    }

    /**
     * Sets the detector used to select the extractors matching the document MIME type.
     *
     * @param detector the detector to be used; <code>null</code> disables the filtering by MIME type.
     */
    public void setMIMETypeDetector(MIMETypeDetector detector) {
        this.detector = detector;
    }

    /**
     * Triggers the execution of all the {@link org.deri.any23.extractor.Extractor}s registered
     * to this class and matching the processed document.
     *
     * @throws ExtractionException if one of the extractors fails.
     * @throws IOException if an I/O error occurs while accessing the document.
     * @throws IllegalArgumentException if the URI declared by the document source cannot be resolved.
     */
    public void run() throws ExtractionException, IOException {
        ensureHasLocalCopy();
        ensureHasDocumentURI();
        log.info("Processing {}", documentURI);
        filterExtractorsByMIMEType();

        // The summary is assembled only when it is actually going to be logged.
        if (log.isDebugEnabled()) {
            final StringBuilder summary = new StringBuilder("Extractors ");
            for (ExtractorFactory<?> factory : matchingExtractors) {
                summary.append(factory.getExtractorName()).append(' ');
            }
            summary.append("match ").append(documentURI);
            log.debug(summary.toString());
        }

        // Invoke all extractors.
        output.startDocument(documentURI);
        output.setContentLength(in.getContentLength());
        for (ExtractorFactory<?> factory : matchingExtractors) {
            runExtractor(factory.createExtractor());
        }
        output.endDocument(documentURI);
    }

    /**
     * Returns the MIME type detected for the document, performing the detection if needed.
     *
     * @return the string representation of the detected MIME type, or <code>null</code> if no
     *         detection took place (no detector set, or all the extractors support all the
     *         content types).
     * @throws IOException if an I/O error occurs while accessing the document.
     */
    public String getDetectedMIMEType() throws IOException {
        filterExtractorsByMIMEType();
        return detectedMIMEType == null ? null : detectedMIMEType.toString();
    }

    /**
     * Checks whether at least one extractor has been selected for the document.
     *
     * @return <code>true</code> if the group of selected extractors is not empty.
     * @throws IOException if an I/O error occurs while accessing the document.
     */
    public boolean hasMatchingExtractors() throws IOException {
        filterExtractorsByMIMEType();
        return !matchingExtractors.isEmpty();
    }

    /**
     * @return the encoding explicitly set for the DOM parser, or <code>null</code> if none was set.
     */
    public String getParserEncoding() {
        return this.parserEncoding;
    }

    /**
     * Forces the encoding used to parse the document DOM. Any DOM already cached is discarded,
     * so that it is rebuilt with the new encoding when next requested.
     *
     * @param encoding the encoding to be used; <code>null</code> restores the automatic detection.
     */
    public void setParserEncoding(String encoding) {
        this.parserEncoding = encoding;
        tagSoupDOM = null;
    }

    /**
     * Selects, once, the extractors to be run. When no detector is set, or when all the
     * extractors support all the content types, the whole group is selected; otherwise the
     * document MIME type is guessed and the group is filtered accordingly.
     *
     * @throws IOException if an I/O error occurs while accessing the document.
     */
    private void filterExtractorsByMIMEType() throws IOException {
        if (matchingExtractors != null) {
            return; // has already been run.
        }
        if (detector == null || extractors.allExtractorsSupportAllContentTypes()) {
            matchingExtractors = extractors;
            return;
        }
        ensureHasLocalCopy();
        // This method is reachable from the public getters before run() has been invoked,
        // hence the document URI may not be resolved yet.
        ensureHasDocumentURI();
        final String documentPath = java.net.URI.create(documentURI.stringValue()).getPath();
        final InputStream content = localDocumentSource.openInputStream();
        try {
            detectedMIMEType = detector.guessMIMEType(
                    documentPath,
                    content,
                    MIMEType.parse(localDocumentSource.getContentType())
            );
        } finally {
            closeQuietly(content);
        }
        log.debug("detected media type: {}", detectedMIMEType);
        matchingExtractors = extractors.filterByMIMEType(detectedMIMEType);
    }

    /**
     * Triggers the execution of a specific {@link org.deri.any23.extractor.Extractor}.
     *
     * @param extractor the {@link org.deri.any23.extractor.Extractor} to be executed.
     * @throws ExtractionException if the extractor fails; the failure is logged and rethrown.
     * @throws IOException if an I/O error occurs while accessing the document.
     */
    private void runExtractor(Extractor<?> extractor)
    throws ExtractionException, IOException {
        final String extractorName = extractor.getDescription().getExtractorName();
        log.debug("Running {} on {}", extractorName, documentURI);
        final long startTime = System.currentTimeMillis();
        final ExtractionResultImpl result = new ExtractionResultImpl(documentURI, extractor, output);
        try {
            if (extractor instanceof BlindExtractor) {
                ((BlindExtractor) extractor).run(documentURI, documentURI, result);
            } else if (extractor instanceof ContentExtractor) {
                ensureHasLocalCopy();
                final InputStream content = localDocumentSource.openInputStream();
                try {
                    ((ContentExtractor) extractor).run(content, documentURI, result);
                } finally {
                    // Harmless if the extractor already closed the stream by itself.
                    closeQuietly(content);
                }
            } else if (extractor instanceof TagSoupDOMExtractor) {
                ((TagSoupDOMExtractor) extractor).run(getTagSoupDOM(), documentURI, result);
            } else {
                throw new RuntimeException("Extractor type not supported: " + extractor.getClass());
            }
        } catch (ExtractionException ex) {
            log.info("{}: {}", extractorName, ex.getMessage());
            throw ex;
        } finally {
            result.close();
            final long elapsed = System.currentTimeMillis() - startTime;
            log.debug("Completed {}, {}ms", extractorName, elapsed);
        }
    }

    /**
     * Resolves, once, the URI of the document from the URI string declared by the source.
     *
     * @throws IllegalArgumentException if the URI cannot be created.
     */
    private void ensureHasDocumentURI() {
        if (documentURI != null) {
            return;
        }
        try {
            documentURI = new Any23ValueFactoryWrapper(ValueFactoryImpl.getInstance())
                    .createURI(in.getDocumentURI());
        } catch (Exception ex) {
            throw new IllegalArgumentException("Invalid URI: " + in.getDocumentURI(), ex);
        }
    }

    /**
     * Makes sure a local version of the document is available: the source itself when it is
     * local, a copy created through the copy factory otherwise.
     *
     * @throws IOException if an I/O error occurs while creating the local copy.
     */
    private void ensureHasLocalCopy() throws IOException {
        if (localDocumentSource != null) {
            return;
        }
        if (in.isLocal()) {
            localDocumentSource = in;
            return;
        }
        if (copyFactory == null) {
            copyFactory = new MemCopyFactory();
        }
        localDocumentSource = copyFactory.createLocalCopy(in);
    }

    /**
     * Returns the DOM of the document, parsing the document on the first invocation and
     * serving the cached DOM afterwards.
     *
     * @return the DOM of the document.
     * @throws IOException if an I/O error occurs while reading the document.
     */
    private Document getTagSoupDOM() throws IOException {
        if (tagSoupDOM == null) {
            ensureHasLocalCopy();
            final InputStream is = new BufferedInputStream(localDocumentSource.openInputStream());
            try {
                // The stream is marked so that the bytes consumed while guessing the encoding
                // can be read again by the parser.
                is.mark(Integer.MAX_VALUE);
                final String candidateEncoding = getCandidateEncoding(is);
                is.reset();
                // NOTE(review): closing the stream below assumes getDOM() consumes the whole
                // input before returning - confirm against TagSoupParser.
                tagSoupDOM = new TagSoupParser(
                        is,
                        documentURI.stringValue(),
                        candidateEncoding
                ).getDOM();
            } finally {
                closeQuietly(is);
            }
        }
        return tagSoupDOM;
    }

    /**
     * Returns the encoding to be used by the DOM parser: the one explicitly set if any,
     * the one guessed from the content otherwise.
     *
     * @param is the stream of the document content, consumed only when the encoding is guessed.
     * @return the encoding to be used.
     * @throws IOException if an I/O error occurs while guessing the encoding.
     */
    private String getCandidateEncoding(InputStream is) throws IOException {
        if (this.parserEncoding != null) {
            return this.parserEncoding;
        }
        return this.encoderDetector.guessEncoding(is);
    }

    /**
     * Closes a stream opened for reading, logging a failure instead of propagating it, so
     * that it cannot hide an exception raised while the stream was in use.
     *
     * @param stream the stream to be closed.
     */
    private static void closeQuietly(InputStream stream) {
        try {
            stream.close();
        } catch (IOException ioe) {
            log.debug("Error while closing input stream.", ioe);
        }
    }
}