// blob: 30f46fc05b12a127d81dda5dfdf0c0e59e62961c [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.any23;
import org.apache.any23.configuration.Configuration;
import org.apache.any23.configuration.DefaultConfiguration;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.extractor.ExtractionParameters;
import org.apache.any23.extractor.ExtractorFactory;
import org.apache.any23.extractor.ExtractorGroup;
import org.apache.any23.extractor.ExtractorRegistryImpl;
import org.apache.any23.extractor.SingleDocumentExtraction;
import org.apache.any23.extractor.SingleDocumentExtractionReport;
import org.apache.any23.http.AcceptHeaderBuilder;
import org.apache.any23.http.DefaultHTTPClient;
import org.apache.any23.http.DefaultHTTPClientConfiguration;
import org.apache.any23.http.HTTPClient;
import org.apache.any23.mime.MIMEType;
import org.apache.any23.mime.MIMETypeDetector;
import org.apache.any23.mime.TikaMIMETypeDetector;
import org.apache.any23.mime.purifier.WhiteSpacesPurifier;
import org.apache.any23.source.DocumentSource;
import org.apache.any23.source.FileDocumentSource;
import org.apache.any23.source.HTTPDocumentSource;
import org.apache.any23.source.LocalCopyFactory;
import org.apache.any23.source.MemCopyFactory;
import org.apache.any23.source.StringDocumentSource;
import org.apache.any23.writer.TripleHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Locale;
/**
* A facade with convenience methods for typical <i>Any23</i> extraction
* operations.
*
* @author Richard Cyganiak (richard@cyganiak.de)
* @author Michele Mostarda (michele.mostarda@gmail.com)
*/
public class Any23 {
/**
* Any23 core library version, read from the <code>any23.core.version</code>
* property of the default configuration.
* NOTE: there's also a version string in pom.xml, they should match.
*/
public static final String VERSION = DefaultConfiguration.singleton().getPropertyOrFail("any23.core.version");
/**
* Default HTTP User Agent, read from the <code>any23.http.user.agent.default</code>
* property of the default configuration.
*/
public static final String DEFAULT_HTTP_CLIENT_USER_AGENT = DefaultConfiguration.singleton().getPropertyOrFail(
"any23.http.user.agent.default"
);
/** Logger shared by this class. */
protected static final Logger logger = LoggerFactory.getLogger(Any23.class);
/** Configuration supplied at construction time; handed to every extraction run. */
private final Configuration configuration;
/**
* Value of <code>any23.http.user.agent.default</code> in this instance's configuration;
* used when {@link #setHTTPUserAgent(String)} receives <code>null</code>.
*/
private final String defaultUserAgent;
/** Detector handed to every extraction run; replaceable via {@link #setMIMETypeDetector(MIMETypeDetector)}. */
private MIMETypeDetector mimeTypeDetector = new TikaMIMETypeDetector(new WhiteSpacesPurifier());
/** Client used for <b>http:</b>/<b>https:</b> sources; initialized on the first {@link #getHTTPClient()} call. */
private HTTPClient httpClient = new DefaultHTTPClient();
/** Becomes <code>true</code> once {@link #getHTTPClient()} has initialized the client; the HTTP setters then throw. */
private boolean httpClientInitialized = false;
/** Extractors applied to each document; also the source of the MIME types used for the accept header. */
private final ExtractorGroup factories;
/** Local copy factory handed to every extraction run; see {@link #setCacheFactory(LocalCopyFactory)}. */
private LocalCopyFactory streamCache;
/** User agent set via {@link #setHTTPUserAgent(String)}; <code>null</code> until that method is called. */
private String userAgent;
/**
 * Constructor that allows the specification of a
 * custom configuration and of a list of extractors.
 *
 * @param configuration configuration used to build the <i>Any23</i> instance,
 *        must not be <code>null</code>.
 * @param extractorGroup the group of extractors to be applied; when <code>null</code>
 *        the group returned by {@link ExtractorRegistryImpl#getExtractorGroup()} is used.
 * @throws NullPointerException if <code>configuration</code> is <code>null</code>.
 */
public Any23(Configuration configuration, ExtractorGroup extractorGroup) {
    if (configuration == null) {
        throw new NullPointerException("configuration must be not null.");
    }
    this.configuration = configuration;
    if (logger.isDebugEnabled()) {
        logger.debug(configuration.getConfigurationDump());
    }
    this.defaultUserAgent = configuration.getPropertyOrFail("any23.http.user.agent.default");
    this.factories = (extractorGroup == null)
            ? ExtractorRegistryImpl.getInstance().getExtractorGroup()
            : extractorGroup;
    // Assign the field directly instead of going through the public, overridable
    // setCacheFactory(...): a subclass override would otherwise run against a
    // partially constructed instance. The argument is never null here, so the
    // setter's null check is not needed.
    this.streamCache = new MemCopyFactory();
}
/**
* Constructor that allows the specification of a list of extractors,
* using {@link DefaultConfiguration#singleton()} as configuration.
*
* @param extractorGroup the group of extractors to be applied; when <code>null</code>
* the group returned by {@link ExtractorRegistryImpl#getExtractorGroup()} is used.
*/
public Any23(ExtractorGroup extractorGroup) {
this(DefaultConfiguration.singleton(), extractorGroup);
}
/**
* Constructor that allows the specification of a
* custom configuration and of a list of extractor names.
*
* @param configuration a {@link Configuration} object, must not be <code>null</code>.
* @param extractorNames list of extractor's names, resolved to an extractor group through
* {@link ExtractorRegistryImpl}; when the array itself is <code>null</code> the group
* returned by {@link ExtractorRegistryImpl#getExtractorGroup()} is used.
*/
public Any23(Configuration configuration, String... extractorNames) {
this(configuration, extractorNames == null ? null :
ExtractorRegistryImpl.getInstance().getExtractorGroup(Arrays.asList(extractorNames))
);
}
/**
* Constructor that allows the specification of a list of extractor names,
* using {@link DefaultConfiguration#singleton()} as configuration.
*
* @param extractorNames list of extractor's names; when the array itself is <code>null</code>
* the group returned by {@link ExtractorRegistryImpl#getExtractorGroup()} is used.
*/
public Any23(String... extractorNames) {
this(DefaultConfiguration.singleton(), extractorNames);
}
/**
* Constructor accepting {@link Configuration}. No extractor names are given,
* so the group returned by {@link ExtractorRegistryImpl#getExtractorGroup()} is used.
*
* @param configuration a {@link Configuration} object, must not be <code>null</code>.
*/
public Any23(Configuration configuration) {
// The cast selects the (Configuration, String...) constructor with a null names array.
this(configuration, (String[]) null);
}
/**
* Constructor with default configuration, i.e. {@link DefaultConfiguration#singleton()}.
*/
public Any23() {
this(DefaultConfiguration.singleton());
}
/**
 * Defines the <i>HTTP Header User Agent</i> of this instance,
 * see <i>RFC 2616-14.43</i>.
 *
 * @param userAgent text describing the user agent; <code>null</code> selects the
 *        default user agent taken from this instance's configuration.
 * @throws IllegalStateException if the HTTP client has already been initialized.
 * @throws IllegalArgumentException if the user agent is empty after trimming.
 */
public void setHTTPUserAgent(String userAgent) {
    if (httpClientInitialized) {
        throw new IllegalStateException("Cannot change HTTP configuration after client has been initialized");
    }
    // Fall back to the configured default when the caller passes null.
    final String effectiveAgent = (userAgent != null) ? userAgent : defaultUserAgent;
    if (effectiveAgent.trim().isEmpty()) {
        throw new IllegalArgumentException(String.format(Locale.ROOT, "Invalid user agent: '%s'", effectiveAgent));
    }
    this.userAgent = effectiveAgent;
}
/**
* Returns the <i>HTTP Header User Agent</i>,
* see <i>RFC 2616-14.43</i>.
*
* @return text describing the user agent, or <code>null</code> if
* {@link #setHTTPUserAgent(String)} has not been called yet.
*/
public String getHTTPUserAgent() {
return this.userAgent;
}
/**
 * Replaces the {@link org.apache.any23.http.HTTPClient} implementation used to
 * retrieve contents. Unless replaced, an instance of
 * {@link org.apache.any23.http.DefaultHTTPClient} is used.
 *
 * @param httpClient a valid client instance.
 * @throws NullPointerException if <code>httpClient</code> is <code>null</code>.
 * @throws IllegalStateException if invoked after client has been initialized.
 */
public void setHTTPClient(HTTPClient httpClient) {
    // The null check deliberately comes first, so a null argument is reported
    // even when the client has already been initialized.
    if (httpClient == null) {
        throw new NullPointerException("httpClient cannot be null.");
    } else if (httpClientInitialized) {
        throw new IllegalStateException("Cannot change HTTP configuration after client has been initialized");
    }
    this.httpClient = httpClient;
}
/**
 * Returns the current {@link org.apache.any23.http.HTTPClient} implementation,
 * initializing it on the first invocation with a
 * {@link org.apache.any23.http.DefaultHTTPClientConfiguration} built from the
 * accept header of the configured extractors.
 *
 * @return instance of HTTPClient, already initialized.
 * @throws IOException if the client is not yet initialized and no user agent has
 *         been set through {@link #setHTTPUserAgent(String)}.
 */
public HTTPClient getHTTPClient() throws IOException {
    if (httpClientInitialized) {
        return httpClient;
    }
    if (userAgent == null) {
        throw new IOException("Must call " + Any23.class.getSimpleName() +
                ".setHTTPUserAgent(String) before extracting from HTTP IRI");
    }
    final String acceptHeader = this.getAcceptHeader();
    httpClient.init(new DefaultHTTPClientConfiguration(acceptHeader));
    // Only flag the client as initialized once init(...) has returned normally.
    httpClientInitialized = true;
    return httpClient;
}
/**
 * Replaces the {@link org.apache.any23.source.LocalCopyFactory} instance
 * that is handed to every extraction run.
 *
 * @param cache valid cache instance.
 * @throws NullPointerException if <code>cache</code> is <code>null</code>.
 */
public void setCacheFactory(LocalCopyFactory cache) {
    if (cache != null) {
        this.streamCache = cache;
        return;
    }
    throw new NullPointerException("cache cannot be null.");
}
/**
* Allows to set an instance of {@link org.apache.any23.mime.MIMETypeDetector}.
* The detector is stored as is (no null check) and handed to every extraction run.
*
* @param detector a valid detector instance, if <code>null</code> all the detectors
* will be used.
* NOTE(review): the <code>null</code> behaviour is decided by the extraction code that
* receives the detector, not by this class - confirm.
*/
public void setMIMETypeDetector(MIMETypeDetector detector) {
this.mimeTypeDetector = detector;
}
/**
 * <p>Creates the {@link DocumentSource} matching the protocol of the given
 * <code>documentIRI</code>: a {@link FileDocumentSource} for <b>file:</b> and a
 * {@link HTTPDocumentSource} for <b>http:</b> and <b>https:</b>. The protocol is
 * matched case-insensitively.</p>
 * <p><b>N.B.</b> <code>documentIRI's</code> <i>must</i> contain one of these protocols,
 * any other value is rejected.</p>
 *
 * @param documentIRI the document <i>IRI</i>.
 * @return a new instance of DocumentSource.
 * @throws NullPointerException if <code>documentIRI</code> is <code>null</code>.
 * @throws IllegalArgumentException if the protocol of <code>documentIRI</code> is not supported.
 * @throws URISyntaxException if an error occurs while parsing the <code>documentIRI</code> as a <i>IRI</i>.
 * @throws IOException if an error occurs while initializing the internal {@link org.apache.any23.http.HTTPClient}.
 */
public DocumentSource createDocumentSource(String documentIRI) throws URISyntaxException, IOException {
    if (documentIRI == null) {
        throw new NullPointerException("documentIRI cannot be null.");
    }
    // Lower-cased copy is used for protocol matching only; the sources receive the IRI unchanged.
    final String lowerCasedIRI = documentIRI.toLowerCase(Locale.ROOT);
    if (lowerCasedIRI.startsWith("file:")) {
        final URI fileURI = new URI(documentIRI);
        return new FileDocumentSource(new File(fileURI));
    }
    final boolean overHTTP = lowerCasedIRI.startsWith("http:") || lowerCasedIRI.startsWith("https:");
    if (!overHTTP) {
        throw new IllegalArgumentException(
                String.format(Locale.ROOT, "Unsupported protocol for document IRI: '%s' . "
                        + "Check that document IRI contains a protocol.", documentIRI)
        );
    }
    return new HTTPDocumentSource(getHTTPClient(), documentIRI);
}
/**
 * Runs a {@link SingleDocumentExtraction} over the given <code>in</code>
 * document source, sending the generated events to the specified
 * <code>outputHandler</code>. The extraction uses this instance's configuration,
 * extractors, MIME type detector and local copy factory.
 *
 * @param eps the extraction parameters to be applied.
 * @param in the input document source.
 * @param outputHandler handler responsible for collecting of the extracted metadata.
 * @param encoding explicit encoding see
 *        <a href="http://www.iana.org/assignments/character-sets">available encodings</a>.
 * @return an {@link ExtractionReport} carrying the matching extractors, the parser encoding,
 *         the detected MIME type, the validation report and the issues per extractor.
 * @throws IOException if there is an error reading the {@link org.apache.any23.source.DocumentSource}
 * @throws org.apache.any23.extractor.ExtractionException if there is an error during extraction
 */
public ExtractionReport extract(
        ExtractionParameters eps,
        DocumentSource in,
        TripleHandler outputHandler,
        String encoding
) throws IOException, ExtractionException {
    final SingleDocumentExtraction extraction =
            new SingleDocumentExtraction(configuration, in, factories, outputHandler);
    extraction.setMIMETypeDetector(mimeTypeDetector);
    extraction.setLocalCopyFactory(streamCache);
    extraction.setParserEncoding(encoding);

    final SingleDocumentExtractionReport runReport = extraction.run(eps);

    // Getters are evaluated in the same order as the report's constructor arguments.
    return new ExtractionReport(
            extraction.getMatchingExtractors(),
            extraction.getParserEncoding(),
            extraction.getDetectedMIMEType(),
            runReport.getValidationReport(),
            runReport.getExtractorToIssues()
    );
}
/**
* Performs metadata extraction on the <code>in</code> string
* associated to the <code>documentIRI</code> IRI, declaring
* <code>contentType</code> and <code>encoding</code>.
* The string is wrapped in a {@link StringDocumentSource} and the
* generated events are sent to the specified <code>outputHandler</code>.
*
* @param in raw data to be analyzed.
* @param documentIRI IRI from which the raw data has been extracted.
* @param contentType declared data content type.
* @param encoding declared data encoding.
* @param outputHandler handler responsible for collecting of the extracted metadata.
* @return the {@link ExtractionReport} produced by the extraction.
* @throws IOException if there is an error reading the {@link org.apache.any23.source.DocumentSource}
* @throws org.apache.any23.extractor.ExtractionException if there is an error during extraction
*/
public ExtractionReport extract(
String in,
String documentIRI,
String contentType,
String encoding,
TripleHandler outputHandler
) throws IOException, ExtractionException {
return extract(new StringDocumentSource(in, documentIRI, contentType, encoding), outputHandler);
}
/**
* Performs metadata extraction on the <code>in</code> string
* associated to the <code>documentIRI</code> IRI, sending the generated
* events to the specified <code>outputHandler</code>.
* The string is wrapped in a {@link StringDocumentSource}.
*
* @param in raw data to be analyzed.
* @param documentIRI IRI from which the raw data has been extracted.
* @param outputHandler handler responsible for collecting of the extracted metadata.
* @return the {@link ExtractionReport} produced by the extraction.
* @throws IOException if there is an error reading the {@link org.apache.any23.source.DocumentSource}
* @throws org.apache.any23.extractor.ExtractionException if there is an error during extraction
*/
public ExtractionReport extract(String in, String documentIRI, TripleHandler outputHandler)
throws IOException, ExtractionException {
return extract(new StringDocumentSource(in, documentIRI), outputHandler);
}
/**
* Performs metadata extraction from the content of the given <code>file</code>
* sending the generated events to the specified <code>outputHandler</code>.
* The file is wrapped in a {@link FileDocumentSource}.
*
* @param file file containing raw data.
* @param outputHandler handler responsible for collecting of the extracted metadata.
* @return the {@link ExtractionReport} produced by the extraction.
* @throws IOException if there is an error reading the {@link org.apache.any23.source.DocumentSource}
* @throws org.apache.any23.extractor.ExtractionException if there is an error during extraction
*/
public ExtractionReport extract(File file, TripleHandler outputHandler)
throws IOException, ExtractionException {
return extract(new FileDocumentSource(file), outputHandler);
}
/**
 * Resolves the given <code>documentIRI</code> to a {@link DocumentSource} through
 * {@link #createDocumentSource(String)} and performs metadata extraction on it,
 * sending the generated events to the specified <code>outputHandler</code>.
 * If the <i>IRI</i> is replied with a redirect, the last will be followed.
 *
 * @param eps the parameters to be applied to the extraction.
 * @param documentIRI the IRI from which retrieve document.
 * @param outputHandler handler responsible for collecting of the extracted metadata.
 * @return the {@link ExtractionReport} produced by the extraction.
 * @throws IOException if there is an error reading the {@link org.apache.any23.source.DocumentSource}
 * @throws org.apache.any23.extractor.ExtractionException if there is an error during extraction,
 *         or if <code>documentIRI</code> cannot be parsed as an <i>IRI</i>.
 */
public ExtractionReport extract(ExtractionParameters eps, String documentIRI, TripleHandler outputHandler)
        throws IOException, ExtractionException {
    final DocumentSource source;
    try {
        // Only the source creation can raise URISyntaxException.
        source = createDocumentSource(documentIRI);
    } catch (URISyntaxException syntaxError) {
        throw new ExtractionException("Error while extracting data from document IRI.", syntaxError);
    }
    return extract(eps, source, outputHandler);
}
/**
* Performs metadata extraction from the content of the given <code>documentIRI</code>
* sending the generated events to the specified <code>outputHandler</code>.
* No extraction parameters are specified (<code>null</code> is passed on).
* If the <i>IRI</i> is replied with a redirect, the last will be followed.
*
* @param documentIRI the IRI from which retrieve document.
* @param outputHandler handler responsible for collecting of the extracted metadata.
* @return the {@link ExtractionReport} produced by the extraction.
* @throws IOException if there is an error reading the {@link org.apache.any23.source.DocumentSource}
* @throws org.apache.any23.extractor.ExtractionException if there is an error during extraction
*/
public ExtractionReport extract(String documentIRI, TripleHandler outputHandler)
throws IOException, ExtractionException {
// The cast disambiguates between the overloads taking three arguments.
return extract((ExtractionParameters) null, documentIRI, outputHandler);
}
/**
* Performs metadata extraction from the content of the given
* <code>in</code> document source, sending the generated events
* to the specified <code>outputHandler</code>.
* No extraction parameters are specified (<code>null</code> is passed on).
*
* @param in the input document source.
* @param outputHandler handler responsible for collecting of the extracted metadata.
* @param encoding explicit encoding see
* <a href="http://www.iana.org/assignments/character-sets">available encodings</a>.
* @return the {@link ExtractionReport} produced by the extraction.
* @throws IOException if there is an error reading the {@link org.apache.any23.source.DocumentSource}
* @throws org.apache.any23.extractor.ExtractionException if there is an error during extraction
*/
public ExtractionReport extract(DocumentSource in, TripleHandler outputHandler, String encoding)
throws IOException, ExtractionException {
return extract(null, in, outputHandler, encoding);
}
/**
* Performs metadata extraction from the content of the given
* <code>in</code> document source, sending the generated events
* to the specified <code>outputHandler</code>.
* Neither extraction parameters nor an explicit encoding are specified
* (<code>null</code> is passed on for both).
*
* @param in the input document source.
* @param outputHandler handler responsible for collecting of the extracted metadata.
* @return the {@link ExtractionReport} produced by the extraction.
* @throws IOException if there is an error reading the {@link org.apache.any23.source.DocumentSource}
* @throws org.apache.any23.extractor.ExtractionException if there is an error during extraction
*/
public ExtractionReport extract(DocumentSource in, TripleHandler outputHandler)
throws IOException, ExtractionException {
return extract(null, in, outputHandler, null);
}
/**
* Performs metadata extraction from the content of the given
* <code>in</code> document source, sending the generated events
* to the specified <code>outputHandler</code>.
* No explicit encoding is specified (<code>null</code> is passed on).
*
* @param eps the parameters to be applied for the extraction phase.
* @param in the input document source.
* @param outputHandler handler responsible for collecting of the extracted metadata.
* @return the {@link ExtractionReport} produced by the extraction.
* @throws IOException if there is an error reading the {@link org.apache.any23.source.DocumentSource}
* @throws org.apache.any23.extractor.ExtractionException if there is an error during extraction
*/
public ExtractionReport extract(ExtractionParameters eps, DocumentSource in, TripleHandler outputHandler)
throws IOException, ExtractionException {
return extract(eps, in, outputHandler, null);
}
/**
 * Builds the accept header from the MIME types supported by
 * all the configured extractor factories.
 *
 * @return the header value produced by {@link AcceptHeaderBuilder}.
 */
private String getAcceptHeader() {
    final Collection<MIMEType> supportedTypes = new ArrayList<>();
    for (final ExtractorFactory<?> extractorFactory : factories) {
        supportedTypes.addAll(extractorFactory.getSupportedMIMETypes());
    }
    final AcceptHeaderBuilder headerBuilder = new AcceptHeaderBuilder(supportedTypes);
    return headerBuilder.getAcceptHeader();
}
}