// 0.10/metadata/src/main/java/org/apache/oodt/cas/metadata/util/MimeTypeUtils.java - oodt - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */


 package org.apache.oodt.cas.metadata.util;

 //JDK imports
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
 import java.util.logging.Level;
 import java.util.logging.Logger;

 //APACHE imports
 import org.apache.tika.Tika;
 import org.apache.tika.detect.DefaultDetector;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.mime.MimeType;
 import org.apache.tika.mime.MimeTypeException;
 import org.apache.tika.mime.MimeTypes;
 import org.apache.tika.mime.MimeTypesFactory;

 /**
  * @author mattmann
  * @author bfoster
  *
  * <p>
  * This is a facade class to insulate CAS Metadata from its underlying Mime Type
  * substrate library, <a href="http://tika.apache.org/">Apache Tika</a>.
  * Any mime handling code should be placed in this utility class, and hidden
  * from the CAS Metadata classes that rely on it.
  * </p>
  */
 public final class MimeTypeUtils {

     private static final String SEPARATOR = ";";

     /* our Tika mime type registry */
     private MimeTypes mimeTypes;

     private Tika tika;

     /* whether or not magic should be employed or not */
     private boolean mimeMagic;

     /* static resource path for the mimeTypesFile */
     public final static String MIME_FILE_RES_PATH = "tika-mimetypes.xml";

     /* our log stream */
     private static final Logger LOG = Logger.getLogger(MimeTypeUtils.class
             .getName());

     public MimeTypeUtils() {
         this(MimeTypeUtils.class.getResourceAsStream(MIME_FILE_RES_PATH), true);
     }

     public MimeTypeUtils(String filePath) throws FileNotFoundException {
         this(filePath, true);
     }

     public MimeTypeUtils(String filePath, boolean magic)
             throws FileNotFoundException {
         this(new FileInputStream(filePath), magic);
     }

     public MimeTypeUtils(InputStream mimeIs, boolean magic) {
     	try {
     		this.mimeTypes = MimeTypesFactory.create(mimeIs);
     		this.mimeMagic = magic;
     		this.tika = new Tika(new DefaultDetector(this.mimeTypes));
     	}catch (Exception e) {
     		LOG.log(Level.SEVERE, "Failed to load MimeType Registry : " + e.getMessage(), e);
     	}
     }

     /**
      * Cleans a {@link MimeType} name by removing out the actual
      * {@link MimeType}, from a string of the form:
      *
      * <pre>
      *           &lt;primary type&gt;/&lt;sub type&gt; ; &lt; optional params
      * </pre>
      *
      * @param origType
      *            The original mime type string to be cleaned.
      * @return The primary type, and subtype, concatenated, e.g., the actual
      *         mime type.
      */
     public static String cleanMimeType(String origType) {
         if (origType == null)
             return null;

         // take the origType and split it on ';'
         String[] tokenizedMimeType = origType.split(SEPARATOR);
         if (tokenizedMimeType.length > 1) {
             // there was a ';' in there, take the first value
             return tokenizedMimeType[0];
         } else {
             // there wasn't a ';', so just return the orig type
             return origType;
         }
     }

     /**
      * Same as {@link #autoResolveContentType(String, String, byte[])}, but
      * this method passes <code>null</code> as the initial type.
      *
      * @param url
      *            The String URL to use to check glob patterns.
      * @param data
      *            The byte data to potentially use in magic detection.
      * @return The String {@link MimeType}.
      */
     public String autoResolveContentType(String url, byte[] data) {
         return autoResolveContentType(null, url, data);
     }

     /**
      * A facade interface to trying all the possible mime type resolution
      * strategies available within Tika. First, the mime type provided in
      * <code>typeName</code> is cleaned, with {@link #cleanMimeType(String)}.
      * Then the cleaned mime type is looked up in the underlying Tika
      * {@link MimeTypes} registry, by its cleaned name. If the {@link MimeType}
      * is found, then that mime type is used, otherwise {@link URL} resolution
      * is used to try and determine the mime type. If that means is
      * unsuccessful, and if <code>mime.type.magic</code> is enabled in
      * {@link NutchConfiguration}, then mime type magic resolution is used to
      * try and obtain a better-than-the-default approximation of the
      * {@link MimeType}.
      *
      * @param typeName
      *            The original mime type, returned from a {@link ProtocolOutput}.
      * @param url
      *            The given {@link URL}, that Nutch was trying to crawl.
      * @param data
      *            The byte data, returned from the crawl, if any.
      * @return The correctly, automatically guessed {@link MimeType} name.
      */
     public String autoResolveContentType(String typeName, String url,
             byte[] data) {
         MimeType type = null;
         String cleanedMimeType = null;

         try {
             cleanedMimeType = MimeTypeUtils.cleanMimeType(typeName) != null ? this.mimeTypes
                     .forName(MimeTypeUtils.cleanMimeType(typeName)).getName()
                     : null;
         } catch (MimeTypeException mte) {
             // Seems to be a malformed mime type name...
         }

         // first try to get the type from the cleaned type name
         try {
             type = cleanedMimeType != null ? this.mimeTypes
                     .forName(cleanedMimeType) : null;
         } catch (MimeTypeException e) {
             type = null;
         }

         // if returned null, or if it's the default type then try url resolution
         if (type == null
                 || (type.getName().equals(MimeTypes.OCTET_STREAM))) {
             // If no mime-type header, or cannot find a corresponding registered
             // mime-type, then guess a mime-type from the url pattern
             try {
                 type = mimeTypes.forName(tika.detect(url)) != null ? mimeTypes.forName(tika.detect(url)) : type;
             } catch (Exception e) {
                 // MimeTypeException or IOException from tika.detect. Ignore.
             }
         }

         // if magic is enabled use mime magic to guess if the mime type returned
         // from the magic guess is different than the one that's already set so
         // far
         // if it is, and it's not the default mime type, then go with the mime
         // type
         // returned by the magic
         if (this.mimeMagic) {
             MimeType magicType;
             try {
                 magicType =  mimeTypes.forName(tika.detect(data));
             } catch (Exception e) {
                 magicType = null;
             }
             if (magicType != null
                     && !magicType.getName().equals(MimeTypes.OCTET_STREAM)
                     && type != null
                     && !type.getName().equals(magicType.getName())) {
                 // If magic enabled and the current mime type differs from that
                 // of the
                 // one returned from the magic, take the magic mimeType
                 type = magicType;
             }

             // if type is STILL null after all the resolution strategies, go for
             // the
             // default type
             if (type == null) {
                 try {
                     type = this.mimeTypes.forName(MimeTypes.OCTET_STREAM);
                 } catch (Exception ignore) {
                 }
             }
         }

         return type.getName();
     }

     /**
      * Facade interface to Tika's underlying
      * {@link tika.detect(String)} method.
      *
      * @param url
      *            A string representation of the document {@link URL} to sense
      *            the {@link MimeType} for.
      * @return An appropriate {@link MimeType}, identified from the given
      *         Document url in string form.
      */
     public String getMimeType(URL url) {
         try {
     	    return tika.detect(url);
         } catch (Exception e) {
             return null;
         }
     }

     /**
      * A facade interface to Tika's underlying {@link org.apache.tika.tika.detect(String)}
      * method.
      *
      * @param name
      *            The name of a valid {@link MimeType} in the Tika mime
      *            registry.
      * @return The object representation of the {@link MimeType}, if it exists,
      *         or null otherwise.
      */
     public String getMimeType(String name) {
         try {
             return tika.detect(name);
         } catch (Exception e) {
             e.printStackTrace();
             return null;
         }
     }

     /**
      * Facade interface to Tika's underlying {@link org.apache.tika.Tika#detect(File)}
      * method.
      *
      * @param f
      *            The {@link File} to sense the {@link MimeType} for.
      * @return The {@link MimeType} of the given {@link File}, or null if it
      *         cannot be determined.
      */
     public String getMimeType(File f) {
         try {
             return tika.detect(f);
         } catch (Exception e) {
             System.err.println("\n\n\n");
             e.printStackTrace();
             System.err.println("\n\n\n");
             return null;
         }
     }

     /**
      * Utility method to act as a facade to
      * {@link MimeTypes#getMimeType(byte[])}.
      *
      * @param data
      *            The byte data to get the {@link MimeType} for.
      * @return The String representation of the resolved {@link MimeType}, or
      *         null if a suitable {@link MimeType} is not found.
      */
     public String getMimeTypeByMagic(byte[] data) {
         try {
             return tika.detect(data);
         } catch (Exception e) {
             return null;
         }
     }

     public String getDescriptionForMimeType(String mimeType) {
     	try {
     		return this.mimeTypes.forName(mimeType).getDescription();
     	}catch (Exception e) {
     		LOG.log(Level.WARNING, "Failed to get description for mimetype "
     				+ mimeType + " : " + e.getMessage());
     		return null;
     	}
     }

     public String getSuperTypeForMimeType(String mimeType) {
     	try {
     		MediaType mediaType = this.mimeTypes.getMediaTypeRegistry().getSupertype(this.mimeTypes.forName(mimeType).getType());
     		if (mediaType != null)
     			return mediaType.getType() + "/" + mediaType.getSubtype();
     		else
     			return null;
     	}catch (Exception e) {
     		LOG.log(Level.WARNING, "Failed to get super-type for mimetype "
     				+ mimeType + " : " + e.getMessage());
     		return null;
     	}
     }

     /**
      * @return the mimeMagic
      */
     public boolean isMimeMagic() {
         return mimeMagic;
     }

     /**
      * @param mimeMagic the mimeMagic to set
      */
     public void setMimeMagic(boolean mimeMagic) {
         this.mimeMagic = mimeMagic;
     }

     public static byte[] readMagicHeader(InputStream stream) throws IOException {
     	return readMagicHeader(stream, 1024);
     }

     public static byte[] readMagicHeader(InputStream stream, int headerByteSize)
     		throws IOException {
         if (stream == null) {
             throw new IllegalArgumentException("InputStream is missing");
         }

         byte[] bytes = new byte[headerByteSize];
         int totalRead = 0;

         int lastRead = stream.read(bytes);
         while (lastRead != -1) {
             totalRead += lastRead;
             if (totalRead == bytes.length) {
                 return bytes;
             }
             lastRead = stream.read(bytes, totalRead, bytes.length - totalRead);
         }

         byte[] shorter = new byte[totalRead];
         System.arraycopy(bytes, 0, shorter, 0, totalRead);
         return shorter;
     }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/


	package org.apache.oodt.cas.metadata.util;

	//JDK imports
	import java.io.File;
	import java.io.FileInputStream;
	import java.io.FileNotFoundException;
	import java.io.IOException;
	import java.io.InputStream;
	import java.net.URL;
	import java.util.logging.Level;
	import java.util.logging.Logger;

	//APACHE imports
	import org.apache.tika.Tika;
	import org.apache.tika.detect.DefaultDetector;
	import org.apache.tika.mime.MediaType;
	import org.apache.tika.mime.MimeType;
	import org.apache.tika.mime.MimeTypeException;
	import org.apache.tika.mime.MimeTypes;
	import org.apache.tika.mime.MimeTypesFactory;

	/**
	* @author mattmann
	* @author bfoster
	*
	* <p>
	* This is a facade class to insulate CAS Metadata from its underlying Mime Type
	* substrate library, <a href="http://tika.apache.org/">Apache Tika</a>.
	* Any mime handling code should be placed in this utility class, and hidden
	* from the CAS Metadata classes that rely on it.
	* </p>
	*/
	public final class MimeTypeUtils {

	private static final String SEPARATOR = ";";

	/* our Tika mime type registry */
	private MimeTypes mimeTypes;

	private Tika tika;

	/* whether or not magic should be employed or not */
	private boolean mimeMagic;

	/* static resource path for the mimeTypesFile */
	public final static String MIME_FILE_RES_PATH = "tika-mimetypes.xml";

	/* our log stream */
	private static final Logger LOG = Logger.getLogger(MimeTypeUtils.class
	.getName());

	public MimeTypeUtils() {
	this(MimeTypeUtils.class.getResourceAsStream(MIME_FILE_RES_PATH), true);
	}

	public MimeTypeUtils(String filePath) throws FileNotFoundException {
	this(filePath, true);
	}

	public MimeTypeUtils(String filePath, boolean magic)
	throws FileNotFoundException {
	this(new FileInputStream(filePath), magic);
	}

	public MimeTypeUtils(InputStream mimeIs, boolean magic) {
	try {
	this.mimeTypes = MimeTypesFactory.create(mimeIs);
	this.mimeMagic = magic;
	this.tika = new Tika(new DefaultDetector(this.mimeTypes));
	}catch (Exception e) {
	LOG.log(Level.SEVERE, "Failed to load MimeType Registry : " + e.getMessage(), e);
	}
	}

	/**
	* Cleans a {@link MimeType} name by removing out the actual
	* {@link MimeType}, from a string of the form:
	*
	* <pre>
	* <primary type>/<sub type> ; < optional params
	* </pre>
	*
	* @param origType
	* The original mime type string to be cleaned.
	* @return The primary type, and subtype, concatenated, e.g., the actual
	* mime type.
	*/
	public static String cleanMimeType(String origType) {
	if (origType == null)
	return null;

	// take the origType and split it on ';'
	String[] tokenizedMimeType = origType.split(SEPARATOR);
	if (tokenizedMimeType.length > 1) {
	// there was a ';' in there, take the first value
	return tokenizedMimeType[0];
	} else {
	// there wasn't a ';', so just return the orig type
	return origType;
	}
	}

	/**
	* Same as {@link #autoResolveContentType(String, String, byte[])}, but
	* this method passes <code>null</code> as the initial type.
	*
	* @param url
	* The String URL to use to check glob patterns.
	* @param data
	* The byte data to potentially use in magic detection.
	* @return The String {@link MimeType}.
	*/
	public String autoResolveContentType(String url, byte[] data) {
	return autoResolveContentType(null, url, data);
	}

	/**
	* A facade interface to trying all the possible mime type resolution
	* strategies available within Tika. First, the mime type provided in
	* <code>typeName</code> is cleaned, with {@link #cleanMimeType(String)}.
	* Then the cleaned mime type is looked up in the underlying Tika
	* {@link MimeTypes} registry, by its cleaned name. If the {@link MimeType}
	* is found, then that mime type is used, otherwise {@link URL} resolution
	* is used to try and determine the mime type. If that means is
	* unsuccessful, and if <code>mime.type.magic</code> is enabled in
	* {@link NutchConfiguration}, then mime type magic resolution is used to
	* try and obtain a better-than-the-default approximation of the
	* {@link MimeType}.
	*
	* @param typeName
	* The original mime type, returned from a {@link ProtocolOutput}.
	* @param url
	* The given {@link URL}, that Nutch was trying to crawl.
	* @param data
	* The byte data, returned from the crawl, if any.
	* @return The correctly, automatically guessed {@link MimeType} name.
	*/
	public String autoResolveContentType(String typeName, String url,
	byte[] data) {
	MimeType type = null;
	String cleanedMimeType = null;

	try {
	cleanedMimeType = MimeTypeUtils.cleanMimeType(typeName) != null ? this.mimeTypes
	.forName(MimeTypeUtils.cleanMimeType(typeName)).getName()
	: null;
	} catch (MimeTypeException mte) {
	// Seems to be a malformed mime type name...
	}

	// first try to get the type from the cleaned type name
	try {
	type = cleanedMimeType != null ? this.mimeTypes
	.forName(cleanedMimeType) : null;
	} catch (MimeTypeException e) {
	type = null;
	}

	// if returned null, or if it's the default type then try url resolution
	if (type == null
	\|\| (type.getName().equals(MimeTypes.OCTET_STREAM))) {
	// If no mime-type header, or cannot find a corresponding registered
	// mime-type, then guess a mime-type from the url pattern
	try {
	type = mimeTypes.forName(tika.detect(url)) != null ? mimeTypes.forName(tika.detect(url)) : type;
	} catch (Exception e) {
	// MimeTypeException or IOException from tika.detect. Ignore.
	}
	}

	// if magic is enabled use mime magic to guess if the mime type returned
	// from the magic guess is different than the one that's already set so
	// far
	// if it is, and it's not the default mime type, then go with the mime
	// type
	// returned by the magic
	if (this.mimeMagic) {
	MimeType magicType;
	try {
	magicType = mimeTypes.forName(tika.detect(data));
	} catch (Exception e) {
	magicType = null;
	}
	if (magicType != null
	&& !magicType.getName().equals(MimeTypes.OCTET_STREAM)
	&& type != null
	&& !type.getName().equals(magicType.getName())) {
	// If magic enabled and the current mime type differs from that
	// of the
	// one returned from the magic, take the magic mimeType
	type = magicType;
	}

	// if type is STILL null after all the resolution strategies, go for
	// the
	// default type
	if (type == null) {
	try {
	type = this.mimeTypes.forName(MimeTypes.OCTET_STREAM);
	} catch (Exception ignore) {
	}
	}
	}

	return type.getName();
	}

	/**
	* Facade interface to Tika's underlying
	* {@link tika.detect(String)} method.
	*
	* @param url
	* A string representation of the document {@link URL} to sense
	* the {@link MimeType} for.
	* @return An appropriate {@link MimeType}, identified from the given
	* Document url in string form.
	*/
	public String getMimeType(URL url) {
	try {
	return tika.detect(url);
	} catch (Exception e) {
	return null;
	}
	}

	/**
	* A facade interface to Tika's underlying {@link org.apache.tika.tika.detect(String)}
	* method.
	*
	* @param name
	* The name of a valid {@link MimeType} in the Tika mime
	* registry.
	* @return The object representation of the {@link MimeType}, if it exists,
	* or null otherwise.
	*/
	public String getMimeType(String name) {
	try {
	return tika.detect(name);
	} catch (Exception e) {
	e.printStackTrace();
	return null;
	}
	}

	/**
	* Facade interface to Tika's underlying {@link org.apache.tika.Tika#detect(File)}
	* method.
	*
	* @param f
	* The {@link File} to sense the {@link MimeType} for.
	* @return The {@link MimeType} of the given {@link File}, or null if it
	* cannot be determined.
	*/
	public String getMimeType(File f) {
	try {
	return tika.detect(f);
	} catch (Exception e) {
	System.err.println("\n\n\n");
	e.printStackTrace();
	System.err.println("\n\n\n");
	return null;
	}
	}

	/**
	* Utility method to act as a facade to
	* {@link MimeTypes#getMimeType(byte[])}.
	*
	* @param data
	* The byte data to get the {@link MimeType} for.
	* @return The String representation of the resolved {@link MimeType}, or
	* null if a suitable {@link MimeType} is not found.
	*/
	public String getMimeTypeByMagic(byte[] data) {
	try {
	return tika.detect(data);
	} catch (Exception e) {
	return null;
	}
	}

	public String getDescriptionForMimeType(String mimeType) {
	try {
	return this.mimeTypes.forName(mimeType).getDescription();
	}catch (Exception e) {
	LOG.log(Level.WARNING, "Failed to get description for mimetype "
	+ mimeType + " : " + e.getMessage());
	return null;
	}
	}

	public String getSuperTypeForMimeType(String mimeType) {
	try {
	MediaType mediaType = this.mimeTypes.getMediaTypeRegistry().getSupertype(this.mimeTypes.forName(mimeType).getType());
	if (mediaType != null)
	return mediaType.getType() + "/" + mediaType.getSubtype();
	else
	return null;
	}catch (Exception e) {
	LOG.log(Level.WARNING, "Failed to get super-type for mimetype "
	+ mimeType + " : " + e.getMessage());
	return null;
	}
	}

	/**
	* @return the mimeMagic
	*/
	public boolean isMimeMagic() {
	return mimeMagic;
	}

	/**
	* @param mimeMagic the mimeMagic to set
	*/
	public void setMimeMagic(boolean mimeMagic) {
	this.mimeMagic = mimeMagic;
	}

	public static byte[] readMagicHeader(InputStream stream) throws IOException {
	return readMagicHeader(stream, 1024);
	}

	public static byte[] readMagicHeader(InputStream stream, int headerByteSize)
	throws IOException {
	if (stream == null) {
	throw new IllegalArgumentException("InputStream is missing");
	}

	byte[] bytes = new byte[headerByteSize];
	int totalRead = 0;

	int lastRead = stream.read(bytes);
	while (lastRead != -1) {
	totalRead += lastRead;
	if (totalRead == bytes.length) {
	return bytes;
	}
	lastRead = stream.read(bytes, totalRead, bytes.length - totalRead);
	}

	byte[] shorter = new byte[totalRead];
	System.arraycopy(bytes, 0, shorter, 0, totalRead);
	return shorter;
	}

	}