// blob: 4c9b7d7d10213150ff3df5931a7b8229ec0bc216 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.any23.mime;
import org.apache.any23.extractor.csv.CSVReaderBuilder;
import org.apache.any23.mime.purifier.Purifier;
import org.apache.any23.mime.purifier.WhiteSpacesPurifier;
import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MimeType;
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.mime.MimeTypes;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.RDFParser;
import org.eclipse.rdf4j.rio.Rio;
import org.eclipse.rdf4j.rio.helpers.BasicParserSettings;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.regex.Pattern;
/**
 * Implementation of {@link MIMETypeDetector} based on
 * <a href="http://tika.apache.org/">Apache Tika</a>.
 * <p>
 * Detection first asks Tika (stream content, declared content type, resource
 * name). When Tika can only say <code>application/octet-stream</code> and a
 * stream is available, the beginning of the stream is sampled and tested, in
 * order, against N3, N-Quads, Turtle and CSV heuristics.
 * <p>
 * The Tika configuration, MIME repository and facade are held in static
 * fields that are initialized by the first constructor call and shared by
 * every instance afterwards.
 *
 * @author Michele Mostarda (michele.mostarda@gmail.com)
 * @author Davide Palmisano (dpalmisano@gmail.com)
 */
public class TikaMIMETypeDetector implements MIMETypeDetector {

    /** MIME type reported when the CSV heuristic matches. */
    public static final String CSV_MIMETYPE = "text/csv";

    /** Classpath location of the Tika configuration file. */
    public static final String RESOURCE_NAME = "/org/apache/any23/mime/tika-config.xml";

    /** Maximum number of characters collected by {@link #extractDataSample(InputStream, char)}. */
    private static final int MAX_SAMPLE_SIZE = 1024 * 2;

    /**
     * N3 patterns.
     */
    private static final Pattern[] N3_PATTERNS = {
            Pattern.compile("^\\S+\\s*<\\S+>\\s*<\\S+>\\s*\\." ),                // * IRI IRI .
            Pattern.compile("^\\S+\\s*<\\S+>\\s*_:\\S+\\s*\\." ),                // * IRI BNODE .
            Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(@\\S+)?\\s*\\." ),        // * IRI LLITERAL .
            Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(\\^\\^\\S+)?\\s*\\.")     // * IRI TLITERAL .
    };

    /**
     * N-Quads patterns.
     */
    private static final Pattern[] NQUADS_PATTERNS = {
            Pattern.compile("^\\S+\\s*<\\S+>\\s*<\\S+>\\s*\\<\\S+>\\s*\\." ),            // * IRI IRI IRI .
            Pattern.compile("^\\S+\\s*<\\S+>\\s*_:\\S+\\s*\\<\\S+>\\s*\\." ),            // * IRI BNODE IRI .
            Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(@\\S+)?\\s*\\<\\S+>\\s*\\." ),    // * IRI LLITERAL IRI .
            Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(\\^\\^\\S+)?\\s*\\<\\S+>\\s*\\.") // * IRI TLITERAL IRI .
    };

    // Shared Tika state; written only inside the class-level synchronized
    // section of the constructor and declared volatile for visibility.
    private static volatile TikaConfig config;
    private static volatile Tika tika;
    private static volatile MimeTypes types;

    /** Applied to every non-null input stream before detection starts. */
    private final Purifier purifier;

    /**
     * Checks if the stream contains the <i>N3</i> triple patterns.
     *
     * @param is input stream to be verified.
     * @return <code>true</code> if <i>N3</i> patterns are detected, <code>false</code> otherwise.
     * @throws IOException if there is an error checking the {@link java.io.InputStream}
     */
    public static boolean checkN3Format(InputStream is) throws IOException {
        return findPattern(N3_PATTERNS, '.', is);
    }

    /**
     * Checks if the stream contains the <i>N-Quads</i> patterns.
     *
     * @param is input stream to be verified.
     * @return <code>true</code> if <i>N-Quads</i> patterns are detected, <code>false</code> otherwise.
     * @throws IOException if there is an error checking the {@link java.io.InputStream}
     */
    public static boolean checkNQuadsFormat(InputStream is) throws IOException {
        return findPattern(NQUADS_PATTERNS, '.', is);
    }

    /**
     * Checks if the stream contains <i>Turtle</i> triple patterns.
     * <p>
     * A sample ending at the first <code>'.'</code> outside a block is handed
     * to a Turtle parser configured to verify datatype values; the sample is
     * considered Turtle when the parser accepts it.
     *
     * @param is input stream to be verified.
     * @return <code>true</code> if <i>Turtle</i> patterns are detected, <code>false</code> otherwise.
     * @throws IOException if there is an error checking the {@link java.io.InputStream}
     */
    public static boolean checkTurtleFormat(InputStream is) throws IOException {
        final String sample = extractDataSample(is, '.');
        final RDFParser turtleParser = Rio.createParser(RDFFormat.TURTLE);
        turtleParser.set(BasicParserSettings.VERIFY_DATATYPE_VALUES, true);
        final ByteArrayInputStream bais = new ByteArrayInputStream(sample.getBytes(StandardCharsets.UTF_8));
        try {
            turtleParser.parse(bais, "");
            return true;
        } catch (Exception e) {
            // Deliberately broad: any failure to parse the sample simply means "not Turtle".
            return false;
        }
    }

    /**
     * Checks if the stream contains a valid <i>CSV</i> pattern.
     *
     * @param is input stream to be verified.
     * @return <code>true</code> if <i>CSV</i> patterns are detected, <code>false</code> otherwise.
     * @throws IOException if there is an error checking the {@link java.io.InputStream}
     */
    public static boolean checkCSVFormat(InputStream is) throws IOException {
        return CSVReaderBuilder.isCSV(is);
    }

    /**
     * Tries to apply one of the given patterns on a sample of the input stream.
     *
     * @param patterns the patterns to apply.
     * @param delimiterChar the delimiter of the sample.
     * @param is the input stream to sample.
     * @return <code>true</code> if a pattern has been applied, <code>false</code> otherwise.
     * @throws IOException if there is an error finding the pattern within
     *         the {@link java.io.InputStream}
     */
    private static boolean findPattern(Pattern[] patterns, char delimiterChar, InputStream is)
            throws IOException {
        final String sample = extractDataSample(is, delimiterChar);
        for (Pattern pattern : patterns) {
            if (pattern.matcher(sample).find()) {
                return true;
            }
        }
        return false;
    }

    /**
     * Extracts a sample of data from the input stream, reading UTF-8
     * characters until the first <i>breakChar</i> found outside a block, the
     * end of the stream, or {@link #MAX_SAMPLE_SIZE} characters, whichever
     * comes first. The <i>breakChar</i>, when found, is part of the sample.
     * <p>
     * A block is opened by <code>&lt;</code> and closed by <code>&gt;</code>,
     * or toggled by a double quote.
     * <p>
     * The stream is always reset before returning. This method does not mark
     * the stream itself, so the caller must have set a mark whose read limit
     * covers what the buffered reader pulls from the stream (which can exceed
     * the length of the returned sample).
     *
     * @param is the input stream to sample.
     * @param breakChar the char to break to sample.
     * @return the sample string.
     * @throws IOException if an error occurs during sampling or while resetting the stream.
     */
    private static String extractDataSample(InputStream is, char breakChar) throws IOException {
        // Not closed on purpose: closing the reader would close the caller's stream.
        // No mark()/reset() on the reader either: it is discarded after this call, and
        // resetting it after reading past its read-ahead limit could raise a spurious
        // "Mark invalid" IOException.
        final BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
        final StringBuilder sb = new StringBuilder();
        boolean insideBlock = false;
        int read = 0;
        int c;
        try {
            while ((c = br.read()) != -1) {
                read++;
                if (read > MAX_SAMPLE_SIZE) {
                    break;
                }
                if ('<' == c) {
                    insideBlock = true;
                } else if ('>' == c) {
                    insideBlock = false;
                } else if ('"' == c) {
                    insideBlock = !insideBlock;
                }
                sb.append((char) c);
                if (!insideBlock && breakChar == c) {
                    break;
                }
            }
        } finally {
            // Rewind the underlying stream so the next check sees the same data.
            is.reset();
        }
        return sb.toString();
    }

    /**
     * Creates a detector that applies the given purifier to every input
     * stream before detection. The first invocation also loads the shared
     * Tika configuration from {@link #RESOURCE_NAME}.
     *
     * @param purifier the purifier applied to non-null input streams.
     * @throws RuntimeException if the Tika configuration cannot be loaded.
     */
    public TikaMIMETypeDetector(Purifier purifier) {
        this.purifier = purifier;
        if (config == null || types == null || tika == null) {
            synchronized (TikaMIMETypeDetector.class) {
                if (config == null) {
                    TikaConfig loaded;
                    // try-with-resources: the configuration stream must not leak.
                    try (InputStream is = getResourceAsStream()) {
                        if (is == null) {
                            throw new IOException("Tika configuration resource not found: " + RESOURCE_NAME);
                        }
                        loaded = new TikaConfig(is);
                    } catch (Exception e) {
                        throw new RuntimeException("Error while loading Tika configuration.", e);
                    }
                    config = loaded;
                }
                if (types == null) {
                    types = config.getMimeRepository();
                }
                if (tika == null) {
                    tika = new Tika(config);
                }
            }
        }
    }

    /**
     * Creates a detector using a {@link WhiteSpacesPurifier}.
     */
    public TikaMIMETypeDetector() {
        this(new WhiteSpacesPurifier());
    }

    /**
     * Estimates the <code>MIME</code> type of the content of input file.
     * The <i>input</i> stream must be resettable.
     * <p>
     * When Tika and the metadata hints only yield
     * <code>application/octet-stream</code> and an input stream is given, the
     * stream is tested for N3, N-Quads, Turtle and CSV content, in that order.
     *
     * @param fileName name of the data source.
     * @param input <code>null</code> or a <i>resettable</i> input stream containing data.
     * @param mimeTypeFromMetadata mimetype declared in metadata.
     * @return the supposed mime type; <code>application/octet-stream</code> is used
     *         when nothing more specific is found.
     * @throws RuntimeException if purifying or reading the input fails with an {@link IOException}.
     * @throws IllegalArgumentException if <i>input</i> is not <code>null</code> and is not resettable
     *         (not checked in this method; presumably raised by the purifier - TODO confirm).
     */
    public MIMEType guessMIMEType(
            String fileName,
            InputStream input,
            MIMEType mimeTypeFromMetadata
    ) {
        if (input != null) {
            try {
                this.purifier.purify(input);
            } catch (IOException e) {
                throw new RuntimeException("Error while purifying the provided input", e);
            }
        }

        final Metadata meta = new Metadata();
        if (mimeTypeFromMetadata != null) {
            meta.set(Metadata.CONTENT_TYPE, mimeTypeFromMetadata.getFullType());
        }
        if (fileName != null) {
            meta.set(Metadata.RESOURCE_NAME_KEY, fileName);
        }

        String type;
        try {
            final String mt = guessMimeTypeByInputAndMeta(input, meta);
            if (input == null || !MimeTypes.OCTET_STREAM.equals(mt)) {
                type = mt;
            } else {
                // Tika could not tell: fall back to the content heuristics.
                if (checkN3Format(input)) {
                    type = RDFFormat.N3.getDefaultMIMEType();
                } else if (checkNQuadsFormat(input)) {
                    type = RDFFormat.NQUADS.getDefaultMIMEType();
                } else if (checkTurtleFormat(input)) {
                    type = RDFFormat.TURTLE.getDefaultMIMEType();
                } else if (checkCSVFormat(input)) {
                    type = CSV_MIMETYPE;
                } else {
                    type = MimeTypes.OCTET_STREAM;
                }
            }
        } catch (IOException ioe) {
            throw new RuntimeException("Error while retrieving mime type.", ioe);
        }
        return MIMEType.parse(type);
    }

    /**
     * Loads the <code>Tika</code> configuration file.
     * <p>
     * The resource is looked up through this class first, then through its
     * class loader, and finally through the system class loader.
     *
     * @return the input stream containing the configuration, or
     *         <code>null</code> if the resource cannot be found.
     */
    private static InputStream getResourceAsStream() {
        InputStream result = TikaMIMETypeDetector.class.getResourceAsStream(RESOURCE_NAME);
        if (result != null) {
            return result;
        }
        // Unlike Class#getResourceAsStream, ClassLoader lookups expect a name
        // without the leading '/'; with it they never find the resource.
        final String loaderName = RESOURCE_NAME.startsWith("/") ? RESOURCE_NAME.substring(1) : RESOURCE_NAME;
        try {
            // getClassLoader() may return null for classes loaded by the bootstrap loader.
            final ClassLoader loader = TikaMIMETypeDetector.class.getClassLoader();
            if (loader != null) {
                result = loader.getResourceAsStream(loaderName);
            }
        } catch (SecurityException ignored) {
            // Not allowed to access the class loader: fall through to the system loader.
        }
        if (result == null) {
            result = ClassLoader.getSystemResourceAsStream(loaderName);
        }
        return result;
    }

    /**
     * Automatically detects the MIME type of a document based on magic
     * markers in the stream prefix and any given metadata hints.
     * <p>
     * The given stream is expected to support marks, so that this method
     * can reset the stream to the position it was in before this method
     * was called.
     * <p>
     * Resolution order: the type detected from the stream unless it is
     * generic (octet-stream, plain text or XML); the declared content type
     * unless it is octet-stream or plain text; the type detected from the
     * resource name unless it is octet-stream; finally the declared content
     * type if one was resolved, otherwise octet-stream.
     *
     * @param stream document stream, may be <code>null</code>
     * @param metadata metadata hints
     * @return MIME type of the document
     * @throws IOException if the document stream could not be read
     */
    private String guessMimeTypeByInputAndMeta(InputStream stream, final Metadata metadata)
            throws IOException {
        if (stream != null) {
            final String type = tika.detect(stream);
            if (type != null && !isGenericMIMEType(type)) {
                return type;
            }
        }

        // Determines the MIMEType based on Content-Type hint if available.
        final String contentType = metadata.get(Metadata.CONTENT_TYPE);
        String candidateMIMEType = null;
        if (contentType != null) {
            try {
                final MimeType type = types.forName(contentType);
                if (type != null) {
                    candidateMIMEType = type.getName();
                    if (!isPlainMIMEType(candidateMIMEType)) {
                        return candidateMIMEType;
                    }
                }
            } catch (MimeTypeException mte) {
                // Malformed content-type value, ignore.
            }
        }

        // Determines the MIMEType based on resource name hint if available.
        final String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
        if (resourceName != null) {
            final String type = tika.detect(resourceName);
            if (type != null && !type.equals(MimeTypes.OCTET_STREAM)) {
                return type;
            }
        }

        // Finally, use the default type if no matches found.
        if (candidateMIMEType != null) {
            return candidateMIMEType;
        }
        return MimeTypes.OCTET_STREAM;
    }

    /**
     * Tells whether the type is <code>application/octet-stream</code> or
     * <code>text/plain</code>.
     *
     * @param type the MIME type name to test.
     * @return <code>true</code> for octet-stream and plain text.
     */
    private static boolean isPlainMIMEType(String type) {
        return type.equals(MimeTypes.OCTET_STREAM) || type.equals(MimeTypes.PLAIN_TEXT);
    }

    /**
     * Tells whether the type is too generic to be trusted as a detection
     * result: octet-stream, plain text or XML.
     *
     * @param type the MIME type name to test.
     * @return <code>true</code> for octet-stream, plain text and XML.
     */
    private static boolean isGenericMIMEType(String type) {
        return isPlainMIMEType(type) || type.equals(MimeTypes.XML);
    }
}