// any23-core/src/main/java/org/deri/any23/Any23.java - any23 - Git at Google

 /*
  * Copyright 2008-2010 Digital Enterprise Research Institute (DERI)
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.deri.any23;

 import org.deri.any23.extractor.ExtractionException;
 import org.deri.any23.extractor.ExtractorFactory;
 import org.deri.any23.extractor.ExtractorGroup;
 import org.deri.any23.extractor.ExtractorRegistry;
 import org.deri.any23.extractor.SingleDocumentExtraction;
 import org.deri.any23.http.AcceptHeaderBuilder;
 import org.deri.any23.http.DefaultHTTPClient;
 import org.deri.any23.http.HTTPClient;
 import org.deri.any23.mime.MIMEType;
 import org.deri.any23.mime.MIMETypeDetector;
 import org.deri.any23.mime.TikaMIMETypeDetector;
 import org.deri.any23.source.DocumentSource;
 import org.deri.any23.source.FileDocumentSource;
 import org.deri.any23.source.HTTPDocumentSource;
 import org.deri.any23.source.LocalCopyFactory;
 import org.deri.any23.source.MemCopyFactory;
 import org.deri.any23.source.StringDocumentSource;
 import org.deri.any23.writer.TripleHandler;

 import java.io.File;
 import java.io.IOException;
 import java.net.URI;
 import java.net.URISyntaxException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;


 /**
  * A facade with convenience methods for typical <i>Any23</i> extraction
  * operations.
  *
  * <p>An instance holds the group of extractors to apply, the
  * {@link org.deri.any23.source.LocalCopyFactory} and
  * {@link org.deri.any23.mime.MIMETypeDetector} handed to every extraction,
  * and an {@link org.deri.any23.http.HTTPClient} that is initialized on the
  * first call to {@link #getHTTPClient()}.</p>
  *
  * @author Richard Cyganiak (richard@cyganiak.de)
  * @author Michele Mostarda (michele.mostarda@gmail.com)
  */
 public class Any23 {

     // NOTE: there's also a version string in build.xml and pom.xml, they should match.
     public static final String VERSION = "0.2.2";

     /** Extractor factories applied by every extraction run by this instance. */
     private final ExtractorGroup factories;

     /** Local copy factory handed to every extraction. */
     private LocalCopyFactory streamCache;

     /** MIME type detector handed to every extraction; can be overridden by setter. */
     private MIMETypeDetector mimeTypeDetector = new TikaMIMETypeDetector();

     /** User agent passed to the HTTP client on initialization; <code>null</code> until set. */
     private String userAgent = null;

     /** HTTP client used for <code>http:</code> and <code>https:</code> URIs. */
     private HTTPClient httpClient = new DefaultHTTPClient();

     /** Becomes <code>true</code> once {@link #getHTTPClient()} has initialized the client. */
     private boolean httpClientInitialized = false;

     /**
      * Constructor. Uses the default extractor group of the
      * {@link org.deri.any23.extractor.ExtractorRegistry}.
      */
     public Any23() {
         this((String[]) null);
     }

     /**
      * Constructor that allows the specification of a list of extractors.
      *
      * @param extractorNames list of extractor's names; if <code>null</code>
      *        the registry's default extractor group is used.
      */
     public Any23(String... extractorNames) {
         factories = (extractorNames == null)
                 ? ExtractorRegistry.getInstance().getExtractorGroup()
                 : ExtractorRegistry.getInstance().getExtractorGroup(Arrays.asList(extractorNames));
         // Assign the field directly instead of calling the overridable setter,
         // so that a subclass override never runs on a partially built instance.
         this.streamCache = new MemCopyFactory();
     }

     /**
      * Sets the <i>HTTP Header User Agent</i>,
      * see <i>RFC 2616-14.43</i>.
      *
      * @param userAgent text describing the user agent.
      * @throws IllegalArgumentException if <code>userAgent</code> is <code>null</code> or blank.
      * @throws IllegalStateException if invoked after client has been initialized.
      */
     public void setHTTPUserAgent(String userAgent) {
         if (userAgent == null || userAgent.trim().length() == 0) {
             throw new IllegalArgumentException( String.format("Invalid user agent: '%s'", userAgent) );
         }
         if (httpClientInitialized) {
             throw new IllegalStateException("Cannot change HTTP configuration after client has been initialized");
         }
         this.userAgent = userAgent;
     }

     /**
      * Returns the <i>HTTP Header User Agent</i>,
      * see <i>RFC 2616-14.43</i>.
      *
      * @return text describing the user agent, or <code>null</code> if none has been set.
      */
     public String getHTTPUserAgent() {
         return this.userAgent;
     }

     /**
      * Allows to set the {@link org.deri.any23.http.HTTPClient} implementation
      * used to retrieve contents. The default instance is {@link org.deri.any23.http.DefaultHTTPClient}.
      *
      * @param httpClient a valid client instance.
      * @throws NullPointerException if <code>httpClient</code> is <code>null</code>.
      * @throws IllegalStateException if invoked after client has been initialized.
      */
     public void setHTTPClient(HTTPClient httpClient) {
         if (httpClient == null) {
             throw new NullPointerException("httpClient cannot be null.");
         }
         if (httpClientInitialized) {
             throw new IllegalStateException("Cannot change HTTP configuration after client has been initialized");
         }
         this.httpClient = httpClient;
     }

     /**
      * Returns the current {@link org.deri.any23.http.HTTPClient} implementation.
      * On the first invocation the client is initialized with the configured
      * user agent and with an <i>Accept</i> header built from the MIME types
      * supported by the configured extractors; after that the HTTP
      * configuration can no longer be changed.
      *
      * @return instance of HTTPClient.
      * @throws IOException if the client still has to be initialized and no
      *         user agent has been set with {@link #setHTTPUserAgent(String)}.
      */
     public HTTPClient getHTTPClient() throws IOException {
         if (!httpClientInitialized) {
             if (userAgent == null) {
                 throw new IOException("Must call " + Any23.class.getSimpleName() +
                         ".setHTTPUserAgent(String) before extracting from HTTP URI");
             }
             httpClient.init(userAgent, getAcceptHeader());
             httpClientInitialized = true;
         }
         return httpClient;
     }

     /**
      * Allows to set a {@link org.deri.any23.source.LocalCopyFactory} instance.
      *
      * @param cache valid cache instance.
      * @throws NullPointerException if <code>cache</code> is <code>null</code>.
      */
     public void setCacheFactory(LocalCopyFactory cache) {
         if (cache == null) {
             throw new NullPointerException("cache cannot be null.");
         }
         this.streamCache = cache;
     }

     /**
      * Allows to set an instance of {@link org.deri.any23.mime.MIMETypeDetector}.
      *
      * @param detector a valid detector instance, if <code>null</code> all the detectors
      *        will be used.
      */
     public void setMIMETypeDetector(MIMETypeDetector detector) {
         this.mimeTypeDetector = detector;
     }

     /**
      * Performs metadata extraction on the <code>in</code> string
      * associated to the <code>documentURI</code> URI, declaring
      * <code>contentType</code> and <code>encoding</code>.
      * The generated events are sent to the specified <code>outputHandler</code>.
      *
      * @param in raw data to be analyzed.
      * @param documentURI URI from which the raw data has been extracted.
      * @param contentType declared data content type.
      * @param encoding declared data encoding.
      * @param outputHandler handler responsible for collecting of the extracted metadata.
      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
      * @throws IOException
      * @throws ExtractionException
      */
     public boolean extract(
             String in,
             String documentURI,
             String contentType,
             String encoding,
             TripleHandler outputHandler
     ) throws IOException, ExtractionException {
         return extract(new StringDocumentSource(in, documentURI, contentType, encoding), outputHandler);
     }

     /**
      * Performs metadata extraction on the <code>in</code> string
      * associated to the <code>documentURI</code> URI, sending the generated
      * events to the specified <code>outputHandler</code>.
      *
      * @param in raw data to be analyzed.
      * @param documentURI URI from which the raw data has been extracted.
      * @param outputHandler handler responsible for collecting of the extracted metadata.
      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
      * @throws IOException
      * @throws ExtractionException
      */
     public boolean extract(String in, String documentURI, TripleHandler outputHandler)
     throws IOException, ExtractionException {
         return extract(new StringDocumentSource(in, documentURI), outputHandler);
     }

     /**
      * Performs metadata extraction from the content of the given <code>file</code>
      * sending the generated events to the specified <code>outputHandler</code>.
      *
      * @param file file containing raw data.
      * @param outputHandler handler responsible for collecting of the extracted metadata.
      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
      * @throws IOException
      * @throws ExtractionException
      */
     public boolean extract(File file, TripleHandler outputHandler)
     throws IOException, ExtractionException {
         return extract(new FileDocumentSource(file), outputHandler);
     }

     /**
      * Performs metadata extraction from the content of the given <code>documentURI</code>
      * sending the generated events to the specified <code>outputHandler</code>.
      * If the <i>URI</i> is replied with a redirect, the last will be followed.
      * Only the <code>file:</code>, <code>http:</code> and <code>https:</code>
      * schemes are accepted; the scheme is matched case-insensitively.
      *
      * @param documentURI the URI from which retrieve document.
      * @param outputHandler handler responsible for collecting of the extracted metadata.
      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
      * @throws NullPointerException if <code>documentURI</code> is <code>null</code>.
      * @throws IOException
      * @throws ExtractionException if the URI is malformed, uses an unsupported
      *         scheme, or is a <code>file:</code> URI that cannot be mapped to a file.
      */
     public boolean extract(String documentURI, TripleHandler outputHandler)
     throws IOException, ExtractionException {
         if (documentURI == null) {
             throw new NullPointerException("documentURI cannot be null.");
         }
         try {
             if (startsWithIgnoreCase(documentURI, "file:")) {
                 final File file;
                 try {
                     file = new File(new URI(documentURI));
                 } catch (IllegalArgumentException iae) {
                     // File(URI) rejects syntactically valid URIs it cannot map to a
                     // path (e.g. the opaque "file:foo"); report them like any other
                     // unusable URI instead of leaking an unchecked exception.
                     throw new ExtractionException(iae);
                 }
                 return extract(new FileDocumentSource(file), outputHandler);
             }
             if (startsWithIgnoreCase(documentURI, "http:") || startsWithIgnoreCase(documentURI, "https:")) {
                 return extract(new HTTPDocumentSource(getHTTPClient(), documentURI), outputHandler);
             }
             throw new ExtractionException("Not a valid absolute URI: " + documentURI);
         } catch (URISyntaxException ex) {
             throw new ExtractionException(ex);
         }
     }

     /**
      * Performs metadata extraction from the content of the given
      * <code>in</code> document source, sending the generated events
      * to the specified <code>outputHandler</code>. The handler is closed
      * when the extraction ends, whether it completes or fails.
      *
      * @param in the input document source.
      * @param outputHandler handler responsible for collecting of the extracted metadata.
      * @param encoding explicit encoding see
      *        <a href="http://www.iana.org/assignments/character-sets">available encodings</a>.
      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
      * @throws IOException
      * @throws ExtractionException
      */
     public boolean extract(DocumentSource in, TripleHandler outputHandler, String encoding)
     throws IOException, ExtractionException {
         SingleDocumentExtraction ex = new SingleDocumentExtraction(in, factories, outputHandler);
         ex.setMIMETypeDetector(mimeTypeDetector);
         ex.setLocalCopyFactory(streamCache);
         ex.setParserEncoding(encoding);
         try {
             ex.run();
         } finally {
             // Release the handler even when the run throws, so it is never left open.
             outputHandler.close();
         }
         return ex.hasMatchingExtractors();
     }

     /**
      * Performs metadata extraction from the content of the given
      * <code>in</code> document source, sending the generated events
      * to the specified <code>outputHandler</code>. No explicit parser
      * encoding is set.
      *
      * @param in the input document source.
      * @param outputHandler handler responsible for collecting of the extracted metadata.
      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
      * @throws IOException
      * @throws ExtractionException
      */
     public boolean extract(DocumentSource in, TripleHandler outputHandler)
     throws IOException, ExtractionException {
         return extract(in, outputHandler, null);
     }

     /**
      * Builds the <i>Accept</i> header value from the MIME types supported
      * by all the configured extractor factories.
      *
      * @return the header value produced by {@link org.deri.any23.http.AcceptHeaderBuilder}.
      */
     private String getAcceptHeader() {
         Collection<MIMEType> mimeTypes = new ArrayList<MIMEType>();
         for (ExtractorFactory<?> factory : factories) {
             mimeTypes.addAll(factory.getSupportedMIMETypes());
         }
         return new AcceptHeaderBuilder(mimeTypes).getAcceptHeader();
     }

     /**
      * Tells whether <code>text</code> begins with <code>prefix</code>, ignoring case.
      * The comparison is done per character and does not depend on the default
      * locale, unlike <code>text.toLowerCase().startsWith(prefix)</code>.
      *
      * @param text the string to inspect.
      * @param prefix the expected leading characters.
      * @return <code>true</code> if <code>text</code> starts with <code>prefix</code>.
      */
     private static boolean startsWithIgnoreCase(String text, String prefix) {
         return text.regionMatches(true, 0, prefix, 0, prefix.length());
     }

 }
	/*
	* Copyright 2008-2010 Digital Enterprise Research Institute (DERI)
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.deri.any23;

	import org.deri.any23.extractor.ExtractionException;
	import org.deri.any23.extractor.ExtractorFactory;
	import org.deri.any23.extractor.ExtractorGroup;
	import org.deri.any23.extractor.ExtractorRegistry;
	import org.deri.any23.extractor.SingleDocumentExtraction;
	import org.deri.any23.http.AcceptHeaderBuilder;
	import org.deri.any23.http.DefaultHTTPClient;
	import org.deri.any23.http.HTTPClient;
	import org.deri.any23.mime.MIMEType;
	import org.deri.any23.mime.MIMETypeDetector;
	import org.deri.any23.mime.TikaMIMETypeDetector;
	import org.deri.any23.source.DocumentSource;
	import org.deri.any23.source.FileDocumentSource;
	import org.deri.any23.source.HTTPDocumentSource;
	import org.deri.any23.source.LocalCopyFactory;
	import org.deri.any23.source.MemCopyFactory;
	import org.deri.any23.source.StringDocumentSource;
	import org.deri.any23.writer.TripleHandler;

	import java.io.File;
	import java.io.IOException;
	import java.net.URI;
	import java.net.URISyntaxException;
	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.Collection;


	/**
	* A facade with convenience methods for typical <i>Any23</i> extraction
	* operations.
	*
	* @author Richard Cyganiak (richard@cyganiak.de)
	* @author Michele Mostarda (michele.mostarda@gmail.com)
	*/
	public class Any23 {

	// NOTE: there's also a version string in build.xml and pom.xml, they should match.
	public static final String VERSION = "0.2.2";

	private final ExtractorGroup factories;
	private LocalCopyFactory streamCache;
	private MIMETypeDetector mimeTypeDetector = new TikaMIMETypeDetector(); // Can be overridden by setter.
	private String userAgent = null;
	private HTTPClient httpClient = new DefaultHTTPClient();
	private boolean httpClientInitialized = false;

	/**
	* Constructor.
	*/
	public Any23() {
	this((String[]) null);
	}

	/**
	* Constructor that allows the specification of a list of extractors.
	*
	* @param extractorNames list of extractor's names.
	*/
	public Any23(String... extractorNames) {
	factories = (extractorNames == null)
	? ExtractorRegistry.getInstance().getExtractorGroup()
	: ExtractorRegistry.getInstance().getExtractorGroup(Arrays.asList(extractorNames));
	setCacheFactory(new MemCopyFactory());
	}

	/**
	* Sets the <i>HTTP Header User Agent</i>,
	* see <i>RFC 2616-14.43</i>.
	*
	* @param userAgent text describing the user agent.
	*/
	public void setHTTPUserAgent(String userAgent) {
	if(userAgent == null \|\| userAgent.trim().length() == 0) {
	throw new IllegalArgumentException( String.format("Invalid user agent: '%s'", userAgent) );
	}
	if (httpClientInitialized) {
	throw new IllegalStateException("Cannot change HTTP configuration after client has been initialized");
	}
	this.userAgent = userAgent;
	}

	/**
	* Returns the <i>HTTP Header User Agent</i>,
	* see <i>RFC 2616-14.43</i>.
	*
	* @return text describing the user agent.
	*/
	public String getHTTPUserAgent() {
	return this.userAgent;
	}

	/**
	* Allows to set the {@link org.deri.any23.http.HTTPClient} implementation
	* used to retrieve contents. The default instance is {@link org.deri.any23.http.DefaultHTTPClient}.
	*
	* @param httpClient a valid client instance.
	* @throws IllegalStateException if invoked after client has been initialized.
	*/
	public void setHTTPClient(HTTPClient httpClient) {
	if(httpClient == null) {
	throw new NullPointerException("httpClient cannot be null.");
	}
	if (httpClientInitialized) {
	throw new IllegalStateException("Cannot change HTTP configuration after client has been initialized");
	}
	this.httpClient = httpClient;
	}

	/**
	* Returns the current {@link org.deri.any23.http.HTTPClient} implementation.
	*
	* @return instance of HTTPClient.
	* @throws IOException if the HTTP client has not initialized.
	*/
	public HTTPClient getHTTPClient() throws IOException {
	if (!httpClientInitialized) {
	if (userAgent == null) {
	throw new IOException("Must call " + Any23.class.getSimpleName() +
	".setHTTPUserAgent(String) before extracting from HTTP URI");
	}
	httpClient.init(userAgent, getAcceptHeader());
	httpClientInitialized = true;
	}
	return httpClient;
	}

	/**
	* Allows to set a {@link org.deri.any23.source.LocalCopyFactory} instance.
	*
	* @param cache valid cache instance.
	*/
	public void setCacheFactory(LocalCopyFactory cache) {
	if(cache == null) {
	throw new NullPointerException("cache cannot be null.");
	}
	this.streamCache = cache;
	}

	/**
	* Allows to set an instance of {@link org.deri.any23.mime.MIMETypeDetector}.
	*
	* @param detector a valid detector instance, if <code>null</code> all the detectors
	* will be used.
	*/
	public void setMIMETypeDetector(MIMETypeDetector detector) {
	this.mimeTypeDetector = detector;
	}

	/**
	* Performs metadata extraction on the <code>in</code> string
	* associated to the <code>documentURI</code> URI, declaring
	* <code>contentType</code> and <code>encoding</code>.
	* The generated events are sent to the specified <code>outputHandler</code>.
	*
	* @param in raw data to be analyzed.
	* @param documentURI URI from which the raw data has been extracted.
	* @param contentType declared data content type.
	* @param encoding declared data encoding.
	* @param outputHandler handler responsible for collecting of the extracted metadata.
	* @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
	* @throws IOException
	* @throws ExtractionException
	*/
	public boolean extract(
	String in,
	String documentURI,
	String contentType,
	String encoding,
	TripleHandler outputHandler
	) throws IOException, ExtractionException {
	return extract(new StringDocumentSource(in, documentURI, contentType, encoding), outputHandler);
	}

	/**
	* Performs metadata extraction on the <code>in</code> string
	* associated to the <code>documentURI</code> URI, sending the generated
	* events to the specified <code>outputHandler</code>.
	*
	* @param in raw data to be analyzed.
	* @param documentURI URI from which the raw data has been extracted.
	* @param outputHandler handler responsible for collecting of the extracted metadata.
	* @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
	* @throws IOException
	* @throws ExtractionException
	*/
	public boolean extract(String in, String documentURI, TripleHandler outputHandler)
	throws IOException, ExtractionException {
	return extract(new StringDocumentSource(in, documentURI), outputHandler);
	}

	/**
	* Performs metadata extraction from the content of the given <code>file</code>
	* sending the generated events to the specified <code>outputHandler</code>.
	*
	* @param file file containing raw data.
	* @param outputHandler handler responsible for collecting of the extracted metadata.
	* @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
	* @throws IOException
	* @throws ExtractionException
	*/
	public boolean extract(File file, TripleHandler outputHandler)
	throws IOException, ExtractionException {
	return extract(new FileDocumentSource(file), outputHandler);
	}

	/**
	* Performs metadata extraction from the content of the given <code>documentURI</code>
	* sending the generated events to the specified <code>outputHandler</code>.
	* If the <i>URI</i> is replied with a redirect, the last will be followed.
	*
	* @param documentURI the URI from which retrieve document.
	* @param outputHandler handler responsible for collecting of the extracted metadata.
	* @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
	* @throws IOException
	* @throws ExtractionException
	*/
	public boolean extract(String documentURI, TripleHandler outputHandler)
	throws IOException, ExtractionException {
	try {
	if (documentURI.toLowerCase().startsWith("file:")) {
	return extract(new FileDocumentSource(new File(new URI(documentURI))), outputHandler);
	}
	if (documentURI.toLowerCase().startsWith("http:") \|\| documentURI.toLowerCase().startsWith("https:")) {
	return extract(new HTTPDocumentSource(getHTTPClient(), documentURI), outputHandler);
	}
	throw new ExtractionException("Not a valid absolute URI: " + documentURI);
	} catch (URISyntaxException ex) {
	throw new ExtractionException(ex);
	}
	}

	/**
	* Performs metadata extraction from the content of the given
	* <code>in</code> document source, sending the generated events
	* to the specified <code>outputHandler</code>.
	*
	* @param in the input document source.
	* @param outputHandler handler responsible for collecting of the extracted metadata.
	* @param encoding explicit encoding see
	* <a href="http://www.iana.org/assignments/character-sets">available encodings</a>.
	* @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
	* @throws IOException
	* @throws ExtractionException
	*/
	public boolean extract(DocumentSource in, TripleHandler outputHandler, String encoding)
	throws IOException, ExtractionException {
	SingleDocumentExtraction ex = new SingleDocumentExtraction(in, factories, outputHandler);
	ex.setMIMETypeDetector(mimeTypeDetector);
	ex.setLocalCopyFactory(streamCache);
	ex.setParserEncoding(encoding);
	ex.run();
	outputHandler.close();
	return ex.hasMatchingExtractors();
	}

	/**
	* Performs metadata extraction from the content of the given
	* <code>in</code> document source, sending the generated events
	* to the specified <code>outputHandler</code>.
	*
	* @param in the input document source.
	* @param outputHandler handler responsible for collecting of the extracted metadata.
	* @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
	* @throws IOException
	* @throws ExtractionException
	*/
	public boolean extract(DocumentSource in, TripleHandler outputHandler)
	throws IOException, ExtractionException {
	return extract(in, outputHandler, null);
	}

	private String getAcceptHeader() {
	Collection<MIMEType> mimeTypes = new ArrayList<MIMEType>();
	for (ExtractorFactory<?> factory : factories) {
	mimeTypes.addAll(factory.getSupportedMIMETypes());
	}
	return new AcceptHeaderBuilder(mimeTypes).getAcceptHeader();
	}

	}