| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| |
| package org.apache.oodt.cas.metadata.util; |
| |
| //JDK imports |
| import java.io.File; |
| import java.io.FileInputStream; |
| import java.io.FileNotFoundException; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.net.URL; |
| import java.util.logging.Level; |
| import java.util.logging.Logger; |
| |
| //APACHE imports |
| import org.apache.tika.Tika; |
| import org.apache.tika.detect.DefaultDetector; |
| import org.apache.tika.mime.MediaType; |
| import org.apache.tika.mime.MimeType; |
| import org.apache.tika.mime.MimeTypeException; |
| import org.apache.tika.mime.MimeTypes; |
| import org.apache.tika.mime.MimeTypesFactory; |
| |
| /** |
| * @author mattmann |
| * @author bfoster |
| * |
| * <p> |
| * This is a facade class to insulate CAS Metadata from its underlying Mime Type |
| * substrate library, <a href="http://tika.apache.org/">Apache Tika</a>. |
| * Any mime handling code should be placed in this utility class, and hidden |
| * from the CAS Metadata classes that rely on it. |
| * </p> |
| */ |
| public final class MimeTypeUtils { |
| |
| private static final String SEPARATOR = ";"; |
| |
| /* our Tika mime type registry */ |
| private MimeTypes mimeTypes; |
| |
| private Tika tika; |
| |
| /* whether or not magic should be employed or not */ |
| private boolean mimeMagic; |
| |
| /* static resource path for the mimeTypesFile */ |
| public final static String MIME_FILE_RES_PATH = "tika-mimetypes.xml"; |
| |
| /* our log stream */ |
| private static final Logger LOG = Logger.getLogger(MimeTypeUtils.class |
| .getName()); |
| |
| public MimeTypeUtils() { |
| this(MimeTypeUtils.class.getResourceAsStream(MIME_FILE_RES_PATH), true); |
| } |
| |
| public MimeTypeUtils(String filePath) throws FileNotFoundException { |
| this(filePath, true); |
| } |
| |
| public MimeTypeUtils(String filePath, boolean magic) |
| throws FileNotFoundException { |
| this(new FileInputStream(filePath), magic); |
| } |
| |
| public MimeTypeUtils(InputStream mimeIs, boolean magic) { |
| try { |
| this.mimeTypes = MimeTypesFactory.create(mimeIs); |
| this.mimeMagic = magic; |
| this.tika = new Tika(new DefaultDetector(this.mimeTypes)); |
| }catch (Exception e) { |
| LOG.log(Level.SEVERE, "Failed to load MimeType Registry : " + e.getMessage(), e); |
| } |
| } |
| |
| /** |
| * Cleans a {@link MimeType} name by removing out the actual |
| * {@link MimeType}, from a string of the form: |
| * |
| * <pre> |
| * <primary type>/<sub type> ; < optional params |
| * </pre> |
| * |
| * @param origType |
| * The original mime type string to be cleaned. |
| * @return The primary type, and subtype, concatenated, e.g., the actual |
| * mime type. |
| */ |
| public static String cleanMimeType(String origType) { |
| if (origType == null) |
| return null; |
| |
| // take the origType and split it on ';' |
| String[] tokenizedMimeType = origType.split(SEPARATOR); |
| if (tokenizedMimeType.length > 1) { |
| // there was a ';' in there, take the first value |
| return tokenizedMimeType[0]; |
| } else { |
| // there wasn't a ';', so just return the orig type |
| return origType; |
| } |
| } |
| |
| /** |
| * Same as {@link #autoResolveContentType(String, String, byte[])}, but |
| * this method passes <code>null</code> as the initial type. |
| * |
| * @param url |
| * The String URL to use to check glob patterns. |
| * @param data |
| * The byte data to potentially use in magic detection. |
| * @return The String {@link MimeType}. |
| */ |
| public String autoResolveContentType(String url, byte[] data) { |
| return autoResolveContentType(null, url, data); |
| } |
| |
| /** |
| * A facade interface to trying all the possible mime type resolution |
| * strategies available within Tika. First, the mime type provided in |
| * <code>typeName</code> is cleaned, with {@link #cleanMimeType(String)}. |
| * Then the cleaned mime type is looked up in the underlying Tika |
| * {@link MimeTypes} registry, by its cleaned name. If the {@link MimeType} |
| * is found, then that mime type is used, otherwise {@link URL} resolution |
| * is used to try and determine the mime type. If that means is |
| * unsuccessful, and if <code>mime.type.magic</code> is enabled in |
| * {@link NutchConfiguration}, then mime type magic resolution is used to |
| * try and obtain a better-than-the-default approximation of the |
| * {@link MimeType}. |
| * |
| * @param typeName |
| * The original mime type, returned from a {@link ProtocolOutput}. |
| * @param url |
| * The given {@link URL}, that Nutch was trying to crawl. |
| * @param data |
| * The byte data, returned from the crawl, if any. |
| * @return The correctly, automatically guessed {@link MimeType} name. |
| */ |
| public String autoResolveContentType(String typeName, String url, |
| byte[] data) { |
| MimeType type = null; |
| String cleanedMimeType = null; |
| |
| try { |
| cleanedMimeType = MimeTypeUtils.cleanMimeType(typeName) != null ? this.mimeTypes |
| .forName(MimeTypeUtils.cleanMimeType(typeName)).getName() |
| : null; |
| } catch (MimeTypeException mte) { |
| // Seems to be a malformed mime type name... |
| } |
| |
| // first try to get the type from the cleaned type name |
| try { |
| type = cleanedMimeType != null ? this.mimeTypes |
| .forName(cleanedMimeType) : null; |
| } catch (MimeTypeException e) { |
| type = null; |
| } |
| |
| // if returned null, or if it's the default type then try url resolution |
| if (type == null |
| || (type.getName().equals(MimeTypes.OCTET_STREAM))) { |
| // If no mime-type header, or cannot find a corresponding registered |
| // mime-type, then guess a mime-type from the url pattern |
| try { |
| type = mimeTypes.forName(tika.detect(url)) != null ? mimeTypes.forName(tika.detect(url)) : type; |
| } catch (Exception e) { |
| // MimeTypeException or IOException from tika.detect. Ignore. |
| } |
| } |
| |
| // if magic is enabled use mime magic to guess if the mime type returned |
| // from the magic guess is different than the one that's already set so |
| // far |
| // if it is, and it's not the default mime type, then go with the mime |
| // type |
| // returned by the magic |
| if (this.mimeMagic) { |
| MimeType magicType; |
| try { |
| magicType = mimeTypes.forName(tika.detect(data)); |
| } catch (Exception e) { |
| magicType = null; |
| } |
| if (magicType != null |
| && !magicType.getName().equals(MimeTypes.OCTET_STREAM) |
| && type != null |
| && !type.getName().equals(magicType.getName())) { |
| // If magic enabled and the current mime type differs from that |
| // of the |
| // one returned from the magic, take the magic mimeType |
| type = magicType; |
| } |
| |
| // if type is STILL null after all the resolution strategies, go for |
| // the |
| // default type |
| if (type == null) { |
| try { |
| type = this.mimeTypes.forName(MimeTypes.OCTET_STREAM); |
| } catch (Exception ignore) { |
| } |
| } |
| } |
| |
| return type.getName(); |
| } |
| |
| /** |
| * Facade interface to Tika's underlying |
| * {@link tika.detect(String)} method. |
| * |
| * @param url |
| * A string representation of the document {@link URL} to sense |
| * the {@link MimeType} for. |
| * @return An appropriate {@link MimeType}, identified from the given |
| * Document url in string form. |
| */ |
| public String getMimeType(URL url) { |
| try { |
| return tika.detect(url); |
| } catch (Exception e) { |
| return null; |
| } |
| } |
| |
| /** |
| * A facade interface to Tika's underlying {@link org.apache.tika.tika.detect(String)} |
| * method. |
| * |
| * @param name |
| * The name of a valid {@link MimeType} in the Tika mime |
| * registry. |
| * @return The object representation of the {@link MimeType}, if it exists, |
| * or null otherwise. |
| */ |
| public String getMimeType(String name) { |
| try { |
| return tika.detect(name); |
| } catch (Exception e) { |
| e.printStackTrace(); |
| return null; |
| } |
| } |
| |
| /** |
| * Facade interface to Tika's underlying {@link org.apache.tika.Tika#detect(File)} |
| * method. |
| * |
| * @param f |
| * The {@link File} to sense the {@link MimeType} for. |
| * @return The {@link MimeType} of the given {@link File}, or null if it |
| * cannot be determined. |
| */ |
| public String getMimeType(File f) { |
| try { |
| return tika.detect(f); |
| } catch (Exception e) { |
| System.err.println("\n\n\n"); |
| e.printStackTrace(); |
| System.err.println("\n\n\n"); |
| return null; |
| } |
| } |
| |
| /** |
| * Utility method to act as a facade to |
| * {@link MimeTypes#getMimeType(byte[])}. |
| * |
| * @param data |
| * The byte data to get the {@link MimeType} for. |
| * @return The String representation of the resolved {@link MimeType}, or |
| * null if a suitable {@link MimeType} is not found. |
| */ |
| public String getMimeTypeByMagic(byte[] data) { |
| try { |
| return tika.detect(data); |
| } catch (Exception e) { |
| return null; |
| } |
| } |
| |
| public String getDescriptionForMimeType(String mimeType) { |
| try { |
| return this.mimeTypes.forName(mimeType).getDescription(); |
| }catch (Exception e) { |
| LOG.log(Level.WARNING, "Failed to get description for mimetype " |
| + mimeType + " : " + e.getMessage()); |
| return null; |
| } |
| } |
| |
| public String getSuperTypeForMimeType(String mimeType) { |
| try { |
| MediaType mediaType = this.mimeTypes.getMediaTypeRegistry().getSupertype(this.mimeTypes.forName(mimeType).getType()); |
| if (mediaType != null) |
| return mediaType.getType() + "/" + mediaType.getSubtype(); |
| else |
| return null; |
| }catch (Exception e) { |
| LOG.log(Level.WARNING, "Failed to get super-type for mimetype " |
| + mimeType + " : " + e.getMessage()); |
| return null; |
| } |
| } |
| |
| /** |
| * @return the mimeMagic |
| */ |
| public boolean isMimeMagic() { |
| return mimeMagic; |
| } |
| |
| /** |
| * @param mimeMagic the mimeMagic to set |
| */ |
| public void setMimeMagic(boolean mimeMagic) { |
| this.mimeMagic = mimeMagic; |
| } |
| |
| public static byte[] readMagicHeader(InputStream stream) throws IOException { |
| return readMagicHeader(stream, 1024); |
| } |
| |
| public static byte[] readMagicHeader(InputStream stream, int headerByteSize) |
| throws IOException { |
| if (stream == null) { |
| throw new IllegalArgumentException("InputStream is missing"); |
| } |
| |
| byte[] bytes = new byte[headerByteSize]; |
| int totalRead = 0; |
| |
| int lastRead = stream.read(bytes); |
| while (lastRead != -1) { |
| totalRead += lastRead; |
| if (totalRead == bytes.length) { |
| return bytes; |
| } |
| lastRead = stream.read(bytes, totalRead, bytes.length - totalRead); |
| } |
| |
| byte[] shorter = new byte[totalRead]; |
| System.arraycopy(bytes, 0, shorter, 0, totalRead); |
| return shorter; |
| } |
| |
| } |