blob: 2be7d6db44273b27fb25a3580038a92a58d2bb73 [file] [log] [blame]
/*
* Copyright 2008-2010 Digital Enterprise Research Institute (DERI)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.deri.any23;
import org.deri.any23.extractor.ExtractionException;
import org.deri.any23.extractor.ExtractorFactory;
import org.deri.any23.extractor.ExtractorGroup;
import org.deri.any23.extractor.ExtractorRegistry;
import org.deri.any23.extractor.SingleDocumentExtraction;
import org.deri.any23.http.AcceptHeaderBuilder;
import org.deri.any23.http.DefaultHTTPClient;
import org.deri.any23.http.HTTPClient;
import org.deri.any23.mime.MIMEType;
import org.deri.any23.mime.MIMETypeDetector;
import org.deri.any23.mime.TikaMIMETypeDetector;
import org.deri.any23.source.DocumentSource;
import org.deri.any23.source.FileDocumentSource;
import org.deri.any23.source.HTTPDocumentSource;
import org.deri.any23.source.LocalCopyFactory;
import org.deri.any23.source.MemCopyFactory;
import org.deri.any23.source.StringDocumentSource;
import org.deri.any23.writer.TripleHandler;
import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
/**
 * A facade with convenience methods for typical <i>Any23</i> extraction
 * operations.
 *
 * <p>An instance is configured through its setters (HTTP user agent, HTTP client,
 * local copy factory, MIME type detector) and then used through one of the
 * <code>extract(...)</code> overloads. The HTTP client is initialized lazily on the
 * first call to {@link #getHTTPClient()}; after that moment the HTTP related
 * configuration can no longer be changed.</p>
 *
 * <p>Instances perform no synchronization of their own.</p>
 *
 * @author Richard Cyganiak (richard@cyganiak.de)
 * @author Michele Mostarda (michele.mostarda@gmail.com)
 */
public class Any23 {

    // NOTE: there's also a version string in build.xml and pom.xml, they should match.
    public static final String VERSION = "0.2.2";

    /** Group of extractor factories applied to every processed document. */
    private final ExtractorGroup factories;

    /** Factory handed to each extraction to obtain local copies of the input. */
    private LocalCopyFactory streamCache;

    private MIMETypeDetector mimeTypeDetector = new TikaMIMETypeDetector(); // Can be overridden by setter.

    /** User agent sent with HTTP requests; must be set before the HTTP client is first used. */
    private String userAgent = null;

    private HTTPClient httpClient = new DefaultHTTPClient();

    /** Becomes <code>true</code> once {@link #getHTTPClient()} has initialized the client. */
    private boolean httpClientInitialized = false;

    /**
     * Constructor. Uses the complete extractor group provided by the
     * {@link org.deri.any23.extractor.ExtractorRegistry}.
     */
    public Any23() {
        this((String[]) null);
    }

    /**
     * Constructor that allows the specification of a list of extractors.
     *
     * @param extractorNames list of extractor's names; if <code>null</code> the
     *        complete extractor group of the registry is used.
     */
    public Any23(String... extractorNames) {
        factories = (extractorNames == null)
                ? ExtractorRegistry.getInstance().getExtractorGroup()
                : ExtractorRegistry.getInstance().getExtractorGroup(Arrays.asList(extractorNames));
        // Assigned directly instead of going through setCacheFactory(): invoking an
        // overridable method from a constructor would run subclass code before the
        // subclass has been initialized.
        this.streamCache = new MemCopyFactory();
    }

    /**
     * Sets the <i>HTTP Header User Agent</i>,
     * see <i>RFC 2616-14.43</i>.
     *
     * @param userAgent text describing the user agent.
     * @throws IllegalArgumentException if <code>userAgent</code> is <code>null</code>
     *         or contains only whitespace.
     * @throws IllegalStateException if invoked after client has been initialized.
     */
    public void setHTTPUserAgent(String userAgent) {
        if (userAgent == null || userAgent.trim().length() == 0) {
            throw new IllegalArgumentException( String.format("Invalid user agent: '%s'", userAgent) );
        }
        if (httpClientInitialized) {
            throw new IllegalStateException("Cannot change HTTP configuration after client has been initialized");
        }
        this.userAgent = userAgent;
    }

    /**
     * Returns the <i>HTTP Header User Agent</i>,
     * see <i>RFC 2616-14.43</i>.
     *
     * @return text describing the user agent, or <code>null</code> if it has not been set yet.
     */
    public String getHTTPUserAgent() {
        return this.userAgent;
    }

    /**
     * Allows to set the {@link org.deri.any23.http.HTTPClient} implementation
     * used to retrieve contents. The default instance is {@link org.deri.any23.http.DefaultHTTPClient}.
     *
     * @param httpClient a valid client instance.
     * @throws NullPointerException if <code>httpClient</code> is <code>null</code>.
     * @throws IllegalStateException if invoked after client has been initialized.
     */
    public void setHTTPClient(HTTPClient httpClient) {
        if (httpClient == null) {
            throw new NullPointerException("httpClient cannot be null.");
        }
        if (httpClientInitialized) {
            throw new IllegalStateException("Cannot change HTTP configuration after client has been initialized");
        }
        this.httpClient = httpClient;
    }

    /**
     * Returns the current {@link org.deri.any23.http.HTTPClient} implementation,
     * initializing it on first access with the configured user agent and with an
     * <i>Accept</i> header built from the MIME types supported by the extractors.
     *
     * @return instance of HTTPClient.
     * @throws IOException if the client still has to be initialized and no user agent
     *         has been set through {@link #setHTTPUserAgent(String)}.
     */
    public HTTPClient getHTTPClient() throws IOException {
        if (!httpClientInitialized) {
            if (userAgent == null) {
                throw new IOException("Must call " + Any23.class.getSimpleName() +
                        ".setHTTPUserAgent(String) before extracting from HTTP URI");
            }
            httpClient.init(userAgent, getAcceptHeader());
            httpClientInitialized = true;
        }
        return httpClient;
    }

    /**
     * Allows to set a {@link org.deri.any23.source.LocalCopyFactory} instance.
     *
     * @param cache valid cache instance.
     * @throws NullPointerException if <code>cache</code> is <code>null</code>.
     */
    public void setCacheFactory(LocalCopyFactory cache) {
        if (cache == null) {
            throw new NullPointerException("cache cannot be null.");
        }
        this.streamCache = cache;
    }

    /**
     * Allows to set an instance of {@link org.deri.any23.mime.MIMETypeDetector}.
     *
     * @param detector a valid detector instance, if <code>null</code> all the detectors
     *        will be used.
     */
    public void setMIMETypeDetector(MIMETypeDetector detector) {
        this.mimeTypeDetector = detector;
    }

    /**
     * Performs metadata extraction on the <code>in</code> string
     * associated to the <code>documentURI</code> URI, declaring
     * <code>contentType</code> and <code>encoding</code>.
     * The generated events are sent to the specified <code>outputHandler</code>.
     *
     * @param in raw data to be analyzed.
     * @param documentURI URI from which the raw data has been extracted.
     * @param contentType declared data content type.
     * @param encoding declared data encoding.
     * @param outputHandler handler responsible for collecting of the extracted metadata.
     * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
     * @throws IOException propagated from the underlying document extraction.
     * @throws ExtractionException propagated from the underlying document extraction.
     */
    public boolean extract(
            String in,
            String documentURI,
            String contentType,
            String encoding,
            TripleHandler outputHandler
    ) throws IOException, ExtractionException {
        return extract(new StringDocumentSource(in, documentURI, contentType, encoding), outputHandler);
    }

    /**
     * Performs metadata extraction on the <code>in</code> string
     * associated to the <code>documentURI</code> URI, sending the generated
     * events to the specified <code>outputHandler</code>.
     *
     * @param in raw data to be analyzed.
     * @param documentURI URI from which the raw data has been extracted.
     * @param outputHandler handler responsible for collecting of the extracted metadata.
     * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
     * @throws IOException propagated from the underlying document extraction.
     * @throws ExtractionException propagated from the underlying document extraction.
     */
    public boolean extract(String in, String documentURI, TripleHandler outputHandler)
    throws IOException, ExtractionException {
        return extract(new StringDocumentSource(in, documentURI), outputHandler);
    }

    /**
     * Performs metadata extraction from the content of the given <code>file</code>
     * sending the generated events to the specified <code>outputHandler</code>.
     *
     * @param file file containing raw data.
     * @param outputHandler handler responsible for collecting of the extracted metadata.
     * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
     * @throws IOException propagated from the underlying document extraction.
     * @throws ExtractionException propagated from the underlying document extraction.
     */
    public boolean extract(File file, TripleHandler outputHandler)
    throws IOException, ExtractionException {
        return extract(new FileDocumentSource(file), outputHandler);
    }

    /**
     * Performs metadata extraction from the content of the given <code>documentURI</code>
     * sending the generated events to the specified <code>outputHandler</code>.
     * If the <i>URI</i> is replied with a redirect, the last will be followed.
     *
     * <p>Only the <code>file:</code>, <code>http:</code> and <code>https:</code> schemes
     * are accepted; the scheme is matched case-insensitively.</p>
     *
     * @param documentURI the URI from which retrieve document.
     * @param outputHandler handler responsible for collecting of the extracted metadata.
     * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
     * @throws NullPointerException if <code>documentURI</code> is <code>null</code>.
     * @throws IOException if an HTTP URI is given before the user agent has been set,
     *         or propagated from the underlying document extraction.
     * @throws ExtractionException if <code>documentURI</code> has an unsupported scheme or
     *         is not a syntactically valid URI, or propagated from the underlying
     *         document extraction.
     */
    public boolean extract(String documentURI, TripleHandler outputHandler)
    throws IOException, ExtractionException {
        if (documentURI == null) {
            throw new NullPointerException("documentURI cannot be null.");
        }
        // Lower-case once with a fixed locale: the default-locale variant maps 'I' to a
        // dotless 'ı' under e.g. a Turkish locale, so "FILE:" would not be recognized.
        final String lowerCaseURI = documentURI.toLowerCase(java.util.Locale.ENGLISH);
        try {
            if (lowerCaseURI.startsWith("file:")) {
                return extract(new FileDocumentSource(new File(new URI(documentURI))), outputHandler);
            }
            if (lowerCaseURI.startsWith("http:") || lowerCaseURI.startsWith("https:")) {
                return extract(new HTTPDocumentSource(getHTTPClient(), documentURI), outputHandler);
            }
            throw new ExtractionException("Not a valid absolute URI: " + documentURI);
        } catch (URISyntaxException ex) {
            throw new ExtractionException(ex);
        }
    }

    /**
     * Performs metadata extraction from the content of the given
     * <code>in</code> document source, sending the generated events
     * to the specified <code>outputHandler</code>.
     * The <code>outputHandler</code> is closed when the extraction terminates,
     * whether it completed normally or raised an exception.
     *
     * @param in the input document source.
     * @param outputHandler handler responsible for collecting of the extracted metadata.
     * @param encoding explicit encoding see
     *        <a href="http://www.iana.org/assignments/character-sets">available encodings</a>.
     * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
     * @throws IOException propagated from the underlying document extraction.
     * @throws ExtractionException propagated from the underlying document extraction.
     */
    public boolean extract(DocumentSource in, TripleHandler outputHandler, String encoding)
    throws IOException, ExtractionException {
        final SingleDocumentExtraction ex = new SingleDocumentExtraction(in, factories, outputHandler);
        ex.setMIMETypeDetector(mimeTypeDetector);
        ex.setLocalCopyFactory(streamCache);
        ex.setParserEncoding(encoding);
        try {
            ex.run();
        } finally {
            // Close the handler even when the extraction fails, so that it is not leaked.
            outputHandler.close();
        }
        return ex.hasMatchingExtractors();
    }

    /**
     * Performs metadata extraction from the content of the given
     * <code>in</code> document source, sending the generated events
     * to the specified <code>outputHandler</code>. No explicit encoding is
     * declared (<code>null</code> is passed on as encoding).
     *
     * @param in the input document source.
     * @param outputHandler handler responsible for collecting of the extracted metadata.
     * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
     * @throws IOException propagated from the underlying document extraction.
     * @throws ExtractionException propagated from the underlying document extraction.
     */
    public boolean extract(DocumentSource in, TripleHandler outputHandler)
    throws IOException, ExtractionException {
        return extract(in, outputHandler, null);
    }

    /**
     * Builds the HTTP <i>Accept</i> header value from the MIME types supported
     * by all the configured extractor factories.
     *
     * @return the accept header string produced by the {@link AcceptHeaderBuilder}.
     */
    private String getAcceptHeader() {
        Collection<MIMEType> mimeTypes = new ArrayList<MIMEType>();
        for (ExtractorFactory<?> factory : factories) {
            mimeTypes.addAll(factory.getSupportedMIMETypes());
        }
        return new AcceptHeaderBuilder(mimeTypes).getAcceptHeader();
    }

}