any23-core/src/main/java/org/deri/any23/extractor/SingleDocumentExtraction.java - any23 - Git at Google

 /**
  * Copyright 2008-2010 Digital Enterprise Research Institute (DERI)
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *          http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  *
  */

 package org.deri.any23.extractor;

 import org.deri.any23.encoding.EncodingDetector;
 import org.deri.any23.encoding.TikaEncodingDetector;
 import org.deri.any23.extractor.Extractor.BlindExtractor;
 import org.deri.any23.extractor.Extractor.ContentExtractor;
 import org.deri.any23.extractor.Extractor.TagSoupDOMExtractor;
 import org.deri.any23.extractor.html.TagSoupParser;
 import org.deri.any23.mime.MIMEType;
 import org.deri.any23.mime.MIMETypeDetector;
 import org.deri.any23.rdf.Any23ValueFactoryWrapper;
 import org.deri.any23.source.DocumentSource;
 import org.deri.any23.source.LocalCopyFactory;
 import org.deri.any23.source.MemCopyFactory;
 import org.deri.any23.writer.TripleHandler;
 import org.openrdf.model.URI;
 import org.openrdf.model.impl.ValueFactoryImpl;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.w3c.dom.Document;
 import java.io.BufferedInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.Collections;

 /**
  * This class acts as facade where all the extractors were called on a single document.
  */
 public class SingleDocumentExtraction {

     private final static Logger log = LoggerFactory.getLogger(SingleDocumentExtraction.class);

     private final DocumentSource in;

     private URI documentURI;

     private final ExtractorGroup extractors;

     private final TripleHandler output;

     private final EncodingDetector encoderDetector;

     private LocalCopyFactory copyFactory = null;

     private DocumentSource localDocumentSource = null;

     private MIMETypeDetector detector = null;

     private ExtractorGroup matchingExtractors = null;

     private MIMEType detectedMIMEType = null;

     private Document tagSoupDOM = null;

     private String parserEncoding = null;

     public SingleDocumentExtraction(DocumentSource in, ExtractorGroup extractors, TripleHandler output) {
         this.in = in;
         this.extractors = extractors;
         this.output = output;
         this.encoderDetector = new TikaEncodingDetector();
     }

     public SingleDocumentExtraction(DocumentSource in, ExtractorFactory<?> factory, TripleHandler output) {
         this(in, new ExtractorGroup(Collections.<ExtractorFactory<?>>singletonList(factory)),
                 output);
         this.setMIMETypeDetector(null);
     }

     public void setLocalCopyFactory(LocalCopyFactory copyFactory) {
         this.copyFactory = copyFactory;
     }

     public void setMIMETypeDetector(MIMETypeDetector detector) {
         this.detector = detector;
     }

     /**
      *
      * Triggers the execution of all the {@link org.deri.any23.extractor.Extractor} registered to this class.
      *
      * @throws ExtractionException
      * @throws IOException
      */
     public void run() throws ExtractionException, IOException {
         ensureHasLocalCopy();
         try {
             this.documentURI = new Any23ValueFactoryWrapper(
                     ValueFactoryImpl.getInstance()).createURI(in.getDocumentURI()
             );
         } catch (Exception ex) {
             throw new IllegalArgumentException("Invalid URI: " + in.getDocumentURI(), ex);
         }
         log.info("Processing " + this.documentURI);
         filterExtractorsByMIMEType();

         StringBuffer sb = new StringBuffer("Extractors ");
         for (ExtractorFactory<?> factory : matchingExtractors) {
             sb.append(factory.getExtractorName());
             sb.append(' ');
         }
         sb.append("match " + documentURI);
         log.debug(sb.toString());

         // Invoke all extractors.
         output.startDocument(documentURI);
         output.setContentLength(in.getContentLength());
         for (ExtractorFactory<?> factory : matchingExtractors) {
             runExtractor(factory.createExtractor());
         }
         output.endDocument(documentURI);
     }

     public String getDetectedMIMEType() throws IOException {
         filterExtractorsByMIMEType();
         return detectedMIMEType.toString();
     }

     public boolean hasMatchingExtractors() throws IOException {
         filterExtractorsByMIMEType();
         return !matchingExtractors.isEmpty();
     }

     public String getParserEncoding() {
         return this.parserEncoding;
     }

     public void setParserEncoding(String encoding) {
         this.parserEncoding = encoding;
         tagSoupDOM = null;
     }

     private void filterExtractorsByMIMEType()
     throws IOException {
         if (matchingExtractors != null) return;  // has already been run.

         if (detector == null || extractors.allExtractorsSupportAllContentTypes()) {
             matchingExtractors = extractors;
             return;
         }
         ensureHasLocalCopy();
         detectedMIMEType = detector.guessMIMEType(
                 java.net.URI.create(documentURI.stringValue()).getPath(),
                 localDocumentSource.openInputStream(),
                 MIMEType.parse(localDocumentSource.getContentType())
         );
         log.debug("detected media type: " + detectedMIMEType);
         matchingExtractors = extractors.filterByMIMEType(detectedMIMEType);
     }

     /**
      * Triggers the execution of a specific {@link org.deri.any23.extractor.Extractor}.
      *
      * @param extractor the {@link org.deri.any23.extractor.Extractor} to be executed.
      * @throws ExtractionException
      * @throws IOException
      */
     private void runExtractor(Extractor<?> extractor)
     throws ExtractionException, IOException {
         log.debug("Running " + extractor.getDescription().getExtractorName() + " on " + documentURI);
         long startTime = System.currentTimeMillis();
         ExtractionResultImpl result = new ExtractionResultImpl(documentURI, extractor, output);
         try {
             if (extractor instanceof BlindExtractor) {
                 ((BlindExtractor) extractor).run(documentURI, documentURI, result);
             } else if (extractor instanceof ContentExtractor) {
                 ensureHasLocalCopy();
                 ((ContentExtractor) extractor).run(localDocumentSource.openInputStream(), documentURI, result);
             } else if (extractor instanceof TagSoupDOMExtractor) {
                 ((TagSoupDOMExtractor) extractor).run(getTagSoupDOM(), documentURI, result);
             } else {
                 throw new RuntimeException("Extractor type not supported: " + extractor.getClass());
             }
         } catch (ExtractionException ex) {
             log.info(extractor.getDescription().getExtractorName() + ": " + ex.getMessage());
             throw ex;
         } finally {
             result.close();
             long elapsed = System.currentTimeMillis() - startTime;
             log.debug("Completed " + extractor.getDescription().getExtractorName() + ", " + elapsed + "ms");
         }
     }

     private void ensureHasLocalCopy() throws IOException {
         if (localDocumentSource != null) return;
         if (in.isLocal()) {
             localDocumentSource = in;
             return;
         }
         if (copyFactory == null) {
             copyFactory = new MemCopyFactory();
         }
         localDocumentSource = copyFactory.createLocalCopy(in);
     }

     private Document getTagSoupDOM() throws IOException {
         if (tagSoupDOM == null) {
             ensureHasLocalCopy();
             final InputStream is = new BufferedInputStream( localDocumentSource.openInputStream() );
             is.mark(Integer.MAX_VALUE);
             final String candidateEncoding = getCandidateEncoding(is);
             is.reset();
             tagSoupDOM = new TagSoupParser(
                     is,
                     documentURI.stringValue(),
                     candidateEncoding
             ).getDOM();
         }
         return tagSoupDOM;
     }

     private String getCandidateEncoding(InputStream is) throws IOException {
         if(this.parserEncoding != null) {
             return this.parserEncoding;
         }
         return this.encoderDetector.guessEncoding(is);

     }

 }
	/**
	* Copyright 2008-2010 Digital Enterprise Research Institute (DERI)
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*
	*/

	package org.deri.any23.extractor;

	import org.deri.any23.encoding.EncodingDetector;
	import org.deri.any23.encoding.TikaEncodingDetector;
	import org.deri.any23.extractor.Extractor.BlindExtractor;
	import org.deri.any23.extractor.Extractor.ContentExtractor;
	import org.deri.any23.extractor.Extractor.TagSoupDOMExtractor;
	import org.deri.any23.extractor.html.TagSoupParser;
	import org.deri.any23.mime.MIMEType;
	import org.deri.any23.mime.MIMETypeDetector;
	import org.deri.any23.rdf.Any23ValueFactoryWrapper;
	import org.deri.any23.source.DocumentSource;
	import org.deri.any23.source.LocalCopyFactory;
	import org.deri.any23.source.MemCopyFactory;
	import org.deri.any23.writer.TripleHandler;
	import org.openrdf.model.URI;
	import org.openrdf.model.impl.ValueFactoryImpl;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;
	import org.w3c.dom.Document;
	import java.io.BufferedInputStream;
	import java.io.IOException;
	import java.io.InputStream;
	import java.util.Collections;

	/**
	* This class acts as facade where all the extractors were called on a single document.
	*/
	public class SingleDocumentExtraction {

	private final static Logger log = LoggerFactory.getLogger(SingleDocumentExtraction.class);

	private final DocumentSource in;

	private URI documentURI;

	private final ExtractorGroup extractors;

	private final TripleHandler output;

	private final EncodingDetector encoderDetector;

	private LocalCopyFactory copyFactory = null;

	private DocumentSource localDocumentSource = null;

	private MIMETypeDetector detector = null;

	private ExtractorGroup matchingExtractors = null;

	private MIMEType detectedMIMEType = null;

	private Document tagSoupDOM = null;

	private String parserEncoding = null;

	public SingleDocumentExtraction(DocumentSource in, ExtractorGroup extractors, TripleHandler output) {
	this.in = in;
	this.extractors = extractors;
	this.output = output;
	this.encoderDetector = new TikaEncodingDetector();
	}

	public SingleDocumentExtraction(DocumentSource in, ExtractorFactory<?> factory, TripleHandler output) {
	this(in, new ExtractorGroup(Collections.<ExtractorFactory<?>>singletonList(factory)),
	output);
	this.setMIMETypeDetector(null);
	}

	public void setLocalCopyFactory(LocalCopyFactory copyFactory) {
	this.copyFactory = copyFactory;
	}

	public void setMIMETypeDetector(MIMETypeDetector detector) {
	this.detector = detector;
	}

	/**
	*
	* Triggers the execution of all the {@link org.deri.any23.extractor.Extractor} registered to this class.
	*
	* @throws ExtractionException
	* @throws IOException
	*/
	public void run() throws ExtractionException, IOException {
	ensureHasLocalCopy();
	try {
	this.documentURI = new Any23ValueFactoryWrapper(
	ValueFactoryImpl.getInstance()).createURI(in.getDocumentURI()
	);
	} catch (Exception ex) {
	throw new IllegalArgumentException("Invalid URI: " + in.getDocumentURI(), ex);
	}
	log.info("Processing " + this.documentURI);
	filterExtractorsByMIMEType();

	StringBuffer sb = new StringBuffer("Extractors ");
	for (ExtractorFactory<?> factory : matchingExtractors) {
	sb.append(factory.getExtractorName());
	sb.append(' ');
	}
	sb.append("match " + documentURI);
	log.debug(sb.toString());

	// Invoke all extractors.
	output.startDocument(documentURI);
	output.setContentLength(in.getContentLength());
	for (ExtractorFactory<?> factory : matchingExtractors) {
	runExtractor(factory.createExtractor());
	}
	output.endDocument(documentURI);
	}

	public String getDetectedMIMEType() throws IOException {
	filterExtractorsByMIMEType();
	return detectedMIMEType.toString();
	}

	public boolean hasMatchingExtractors() throws IOException {
	filterExtractorsByMIMEType();
	return !matchingExtractors.isEmpty();
	}

	public String getParserEncoding() {
	return this.parserEncoding;
	}

	public void setParserEncoding(String encoding) {
	this.parserEncoding = encoding;
	tagSoupDOM = null;
	}

	private void filterExtractorsByMIMEType()
	throws IOException {
	if (matchingExtractors != null) return; // has already been run.

	if (detector == null \|\| extractors.allExtractorsSupportAllContentTypes()) {
	matchingExtractors = extractors;
	return;
	}
	ensureHasLocalCopy();
	detectedMIMEType = detector.guessMIMEType(
	java.net.URI.create(documentURI.stringValue()).getPath(),
	localDocumentSource.openInputStream(),
	MIMEType.parse(localDocumentSource.getContentType())
	);
	log.debug("detected media type: " + detectedMIMEType);
	matchingExtractors = extractors.filterByMIMEType(detectedMIMEType);
	}

	/**
	* Triggers the execution of a specific {@link org.deri.any23.extractor.Extractor}.
	*
	* @param extractor the {@link org.deri.any23.extractor.Extractor} to be executed.
	* @throws ExtractionException
	* @throws IOException
	*/
	private void runExtractor(Extractor<?> extractor)
	throws ExtractionException, IOException {
	log.debug("Running " + extractor.getDescription().getExtractorName() + " on " + documentURI);
	long startTime = System.currentTimeMillis();
	ExtractionResultImpl result = new ExtractionResultImpl(documentURI, extractor, output);
	try {
	if (extractor instanceof BlindExtractor) {
	((BlindExtractor) extractor).run(documentURI, documentURI, result);
	} else if (extractor instanceof ContentExtractor) {
	ensureHasLocalCopy();
	((ContentExtractor) extractor).run(localDocumentSource.openInputStream(), documentURI, result);
	} else if (extractor instanceof TagSoupDOMExtractor) {
	((TagSoupDOMExtractor) extractor).run(getTagSoupDOM(), documentURI, result);
	} else {
	throw new RuntimeException("Extractor type not supported: " + extractor.getClass());
	}
	} catch (ExtractionException ex) {
	log.info(extractor.getDescription().getExtractorName() + ": " + ex.getMessage());
	throw ex;
	} finally {
	result.close();
	long elapsed = System.currentTimeMillis() - startTime;
	log.debug("Completed " + extractor.getDescription().getExtractorName() + ", " + elapsed + "ms");
	}
	}

	private void ensureHasLocalCopy() throws IOException {
	if (localDocumentSource != null) return;
	if (in.isLocal()) {
	localDocumentSource = in;
	return;
	}
	if (copyFactory == null) {
	copyFactory = new MemCopyFactory();
	}
	localDocumentSource = copyFactory.createLocalCopy(in);
	}

	private Document getTagSoupDOM() throws IOException {
	if (tagSoupDOM == null) {
	ensureHasLocalCopy();
	final InputStream is = new BufferedInputStream( localDocumentSource.openInputStream() );
	is.mark(Integer.MAX_VALUE);
	final String candidateEncoding = getCandidateEncoding(is);
	is.reset();
	tagSoupDOM = new TagSoupParser(
	is,
	documentURI.stringValue(),
	candidateEncoding
	).getDOM();
	}
	return tagSoupDOM;
	}

	private String getCandidateEncoding(InputStream is) throws IOException {
	if(this.parserEncoding != null) {
	return this.parserEncoding;
	}
	return this.encoderDetector.guessEncoding(is);

	}

	}