| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.nutch.util; |
| |
| import java.io.File; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.lang.invoke.MethodHandles; |
| |
| import org.apache.hadoop.conf.Configuration; |
| |
| import org.apache.tika.Tika; |
| import org.apache.tika.io.TikaInputStream; |
| import org.apache.tika.metadata.Metadata; |
| import org.apache.tika.mime.MimeType; |
| import org.apache.tika.mime.MimeTypeException; |
| import org.apache.tika.mime.MimeTypes; |
| import org.apache.tika.mime.MimeTypesFactory; |
| |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| import org.apache.nutch.protocol.ProtocolOutput; |
| |
| /** |
| * @author mattmann |
| * @since NUTCH-608 |
| * |
| * <p> |
| * This is a facade class to insulate Nutch from its underlying Mime Type |
| * substrate library, <a href="http://incubator.apache.org/tika/">Apache |
| * Tika</a>. Any mime handling code should be placed in this utility |
| * class, and hidden from the Nutch classes that rely on it. |
| * </p> |
| */ |
| public final class MimeUtil { |
| |
| private static final String SEPARATOR = ";"; |
| |
| /* our Tika mime type registry */ |
| private MimeTypes mimeTypes; |
| |
| /* the tika detectors */ |
| private Tika tika; |
| |
| /* whether or not magic should be employed or not */ |
| private boolean mimeMagic; |
| |
| /* our log stream */ |
| private static final Logger LOG = LoggerFactory |
| .getLogger(MethodHandles.lookup().lookupClass()); |
| |
| public MimeUtil(Configuration conf) { |
| ObjectCache objectCache = ObjectCache.get(conf); |
| tika = (Tika) objectCache.getObject(Tika.class.getName()); |
| if (tika == null) { |
| tika = new Tika(); |
| objectCache.setObject(Tika.class.getName(), tika); |
| } |
| MimeTypes mimeTypez = (MimeTypes) objectCache.getObject(MimeTypes.class |
| .getName()); |
| if (mimeTypez == null) { |
| try { |
| String customMimeTypeFile = conf.get("mime.types.file"); |
| if (customMimeTypeFile != null |
| && customMimeTypeFile.equals("") == false) { |
| try { |
| LOG.info("Using custom mime.types.file: {}", customMimeTypeFile); |
| mimeTypez = MimeTypesFactory.create(conf |
| .getConfResourceAsInputStream(customMimeTypeFile)); |
| } catch (Exception e) { |
| LOG.error("Can't load mime.types.file : " + customMimeTypeFile |
| + " using Tika's default"); |
| } |
| } |
| if (mimeTypez == null) |
| mimeTypez = MimeTypes.getDefaultMimeTypes(); |
| } catch (Exception e) { |
| LOG.error("Exception in MimeUtil " + e.getMessage()); |
| throw new RuntimeException(e); |
| } |
| objectCache.setObject(MimeTypes.class.getName(), mimeTypez); |
| } |
| |
| this.mimeTypes = mimeTypez; |
| this.mimeMagic = conf.getBoolean("mime.type.magic", true); |
| } |
| |
| /** |
| * Cleans a {@link MimeType} name by removing out the actual {@link MimeType}, |
| * from a string of the form: |
| * |
| * <pre> |
| * <primary type>/<sub type> ; < optional params |
| * </pre> |
| * |
| * @param origType |
| * The original mime type string to be cleaned. |
| * @return The primary type, and subtype, concatenated, e.g., the actual mime |
| * type. |
| */ |
| public static String cleanMimeType(String origType) { |
| if (origType == null) |
| return null; |
| |
| // take the origType and split it on ';' |
| String[] tokenizedMimeType = origType.split(SEPARATOR); |
| if (tokenizedMimeType.length > 1) { |
| // there was a ';' in there, take the first value |
| return tokenizedMimeType[0]; |
| } else { |
| // there wasn't a ';', so just return the orig type |
| return origType; |
| } |
| } |
| |
| /** |
| * A facade interface to trying all the possible mime type resolution |
| * strategies available within Tika. First, the mime type provided in |
| * <code>typeName</code> is cleaned, with {@link #cleanMimeType(String)}. Then |
| * the cleaned mime type is looked up in the underlying Tika {@link MimeTypes} |
| * registry, by its cleaned name. If the {@link MimeType} is found, then that |
| * mime type is used, otherwise URL resolution is used to try and determine |
| * the mime type. However, if <code>mime.type.magic</code> is enabled in |
| * {@link NutchConfiguration}, then mime type magic resolution is used to try |
| * and obtain a better-than-the-default approximation of the {@link MimeType}. |
| * |
| * @param typeName |
| * The original mime type, returned from a {@link ProtocolOutput}. |
| * @param url |
| * The given @see url, that Nutch was trying to crawl. |
| * @param data |
| * The byte data, returned from the crawl, if any. |
| * @return The correctly, automatically guessed {@link MimeType} name. |
| */ |
| public String autoResolveContentType(String typeName, String url, byte[] data) { |
| String retType = null; |
| MimeType type = null; |
| String cleanedMimeType = null; |
| |
| cleanedMimeType = MimeUtil.cleanMimeType(typeName); |
| // first try to get the type from the cleaned type name |
| if (cleanedMimeType != null) { |
| try { |
| type = mimeTypes.forName(cleanedMimeType); |
| cleanedMimeType = type.getName(); |
| } catch (MimeTypeException mte) { |
| // Seems to be a malformed mime type name... |
| cleanedMimeType = null; |
| } |
| } |
| |
| // if returned null, or if it's the default type then try url resolution |
| if (type == null || type.getName().equals(MimeTypes.OCTET_STREAM)) { |
| // If no mime-type header, or cannot find a corresponding registered |
| // mime-type, then guess a mime-type from the url pattern |
| try { |
| retType = tika.detect(url) != null ? tika.detect(url) : null; |
| } catch (Exception e) { |
| String message = "Problem loading default Tika configuration"; |
| LOG.error(message, e); |
| throw new RuntimeException(e); |
| } |
| } else { |
| retType = type.getName(); |
| } |
| |
| // if magic is enabled use mime magic to guess if the mime type returned |
| // from the magic guess is different than the one that's already set so far |
| // if it is, and it's not the default mime type, then go with the mime type |
| // returned by the magic |
| if (this.mimeMagic) { |
| String magicType = null; |
| // pass URL (file name) and (cleansed) content type from protocol to Tika |
| Metadata tikaMeta = new Metadata(); |
| tikaMeta.add(Metadata.RESOURCE_NAME_KEY, url); |
| tikaMeta.add(Metadata.CONTENT_TYPE, |
| (cleanedMimeType != null ? cleanedMimeType : typeName)); |
| try { |
| try (InputStream stream = TikaInputStream.get(data)) { |
| magicType = mimeTypes.detect(stream, tikaMeta).toString(); |
| } |
| } catch (IOException ignore) { |
| } |
| |
| if (magicType != null && !magicType.equals(MimeTypes.OCTET_STREAM) |
| && retType != null && !retType.equals(magicType)) { |
| |
| // If magic enabled and the current mime type differs from that of the |
| // one returned from the magic, take the magic mimeType |
| retType = magicType; |
| } |
| |
| // if type is STILL null after all the resolution strategies, go for the |
| // default type |
| if (retType == null) { |
| try { |
| retType = MimeTypes.OCTET_STREAM; |
| } catch (Exception ignore) { |
| } |
| } |
| } |
| |
| return retType; |
| } |
| |
| /** |
| * Facade interface to Tika's underlying {@link MimeTypes#getMimeType(String)} |
| * method. |
| * |
| * @param url |
| * A string representation of the document URL to sense the |
| * {@link org.apache.tika.mime.MimeType MimeType} for. |
| * @return An appropriate {@link MimeType}, identified from the given Document |
| * url in string form. |
| */ |
| public String getMimeType(String url) { |
| return tika.detect(url); |
| } |
| |
| /** |
| * A facade interface to Tika's underlying {@link MimeTypes#forName(String)} |
| * method. |
| * |
| * @param name |
| * The name of a valid {@link MimeType} in the Tika mime registry. |
| * @return The object representation of the {@link MimeType}, if it exists, or |
| * null otherwise. |
| */ |
| public String forName(String name) { |
| try { |
| return this.mimeTypes.forName(name).toString(); |
| } catch (MimeTypeException e) { |
| LOG.error("Exception getting mime type by name: [" + name |
| + "]: Message: " + e.getMessage()); |
| return null; |
| } |
| } |
| |
| /** |
| * Facade interface to Tika's underlying {@link MimeTypes#getMimeType(File)} |
| * method. |
| * |
| * @param f |
| * The {@link File} to sense the {@link MimeType} for. |
| * @return The {@link MimeType} of the given {@link File}, or null if it |
| * cannot be determined. |
| */ |
| public String getMimeType(File f) { |
| try { |
| return tika.detect(f); |
| } catch (Exception e) { |
| LOG.error("Exception getting mime type for file: [" + f.getPath() |
| + "]: Message: " + e.getMessage()); |
| return null; |
| } |
| } |
| |
| } |