| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.nutch.indexer.more; |
| |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| import org.apache.nutch.metadata.Metadata; |
| |
| import org.apache.nutch.net.protocols.HttpDateFormat; |
| import org.apache.nutch.net.protocols.Response; |
| |
| import org.apache.nutch.parse.Parse; |
| |
| import org.apache.nutch.indexer.IndexingFilter; |
| import org.apache.nutch.indexer.IndexingException; |
| import org.apache.nutch.indexer.NutchDocument; |
| |
| import org.apache.nutch.crawl.CrawlDatum; |
| import org.apache.nutch.crawl.Inlinks; |
| import org.apache.nutch.parse.ParseData; |
| import org.apache.nutch.util.MimeUtil; |
| import org.apache.tika.Tika; |
| |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.hadoop.io.Text; |
| import org.apache.hadoop.io.Writable; |
| |
| import java.text.ParseException; |
| |
| import java.lang.invoke.MethodHandles; |
| import java.io.BufferedReader; |
| import java.io.IOException; |
| import java.util.Date; |
| import java.util.HashMap; |
| import java.util.regex.Matcher; |
| import java.util.regex.Pattern; |
| import java.util.regex.PatternSyntaxException; |
| |
| import org.apache.commons.lang.StringUtils; |
| import org.apache.commons.lang.time.DateUtils; |
| |
| import java.io.File; |
| import java.net.URL; |
| import java.util.List; |
| import java.util.ArrayList; |
| import org.apache.commons.io.FileUtils; |
| import java.nio.charset.StandardCharsets; |
| |
| /** |
| * Add (or reset) a few metaData properties as respective fields (if they are |
| * available), so that they can be accurately used within the search index. |
| * |
 * 'lastModified' is indexed to support query by date, 'contentLength' obtains
| * content length from the HTTP header, 'type' field is indexed to support query |
| * by type and finally the 'title' field is an attempt to reset the title if a |
| * content-disposition hint exists. The logic is that such a presence is |
| * indicative that the content provider wants the filename therein to be used as |
| * the title. |
| * |
| * Still need to make content-length searchable! |
| * |
| * @author John Xing |
| */ |
| |
| public class MoreIndexingFilter implements IndexingFilter { |
| private static final Logger LOG = LoggerFactory |
| .getLogger(MethodHandles.lookup().lookupClass()); |
| |
| /** Get the MimeTypes resolver instance. */ |
| private MimeUtil MIME; |
| private Tika tika = new Tika(); |
| |
| /** Map for mime-type substitution */ |
| private HashMap<String, String> mimeMap = null; |
| private boolean mapMimes = false; |
| private String mapFieldName; |
| |
| /** Date-styles used to parse date. */ |
| private String[] defaultDateStyles = new String[] { |
| "EEE MMM dd HH:mm:ss yyyy", "EEE MMM dd HH:mm:ss yyyy zzz", |
| "EEE MMM dd HH:mm:ss zzz yyyy", "EEE, MMM dd HH:mm:ss yyyy zzz", |
| "EEE, dd MMM yyyy HH:mm:ss zzz", "EEE,dd MMM yyyy HH:mm:ss zzz", |
| "EEE, dd MMM yyyy HH:mm:sszzz", "EEE, dd MMM yyyy HH:mm:ss", |
| "EEE, dd-MMM-yy HH:mm:ss zzz", "yyyy/MM/dd HH:mm:ss.SSS zzz", |
| "yyyy/MM/dd HH:mm:ss.SSS", "yyyy/MM/dd HH:mm:ss zzz", "yyyy/MM/dd", |
| "yyyy.MM.dd HH:mm:ss", "yyyy-MM-dd HH:mm", |
| "MMM dd yyyy HH:mm:ss. zzz", "MMM dd yyyy HH:mm:ss zzz", |
| "dd.MM.yyyy HH:mm:ss zzz", "dd MM yyyy HH:mm:ss zzz", |
| "dd.MM.yyyy; HH:mm:ss", "dd.MM.yyyy HH:mm:ss", "dd.MM.yyyy zzz", |
| "yyyy-MM-dd'T'HH:mm:ssXXX" }; |
| private String[] dateStyles = null; |
| |
| @Override |
| public NutchDocument filter(NutchDocument doc, Parse parse, Text url, |
| CrawlDatum datum, Inlinks inlinks) throws IndexingException { |
| |
| String url_s = url.toString(); |
| |
| addTime(doc, parse.getData(), url_s, datum); |
| addLength(doc, parse.getData()); |
| addType(doc, parse.getData(), url_s, datum); |
| resetTitle(doc, parse.getData()); |
| |
| return doc; |
| } |
| |
| // Add time related meta info. Add last-modified if present. Index date as |
| // last-modified, or, if that's not present, use fetch time. |
| private NutchDocument addTime(NutchDocument doc, ParseData data, String url, |
| CrawlDatum datum) { |
| long time = -1; |
| |
| String lastModified = data.getMeta(Metadata.LAST_MODIFIED); |
| if (lastModified != null) { // try parse last-modified |
| time = getTime(lastModified, url); // use as time |
| // store as Date |
| if (time > -1) { |
| doc.add("lastModified", new Date(time)); |
| } |
| } |
| |
| if (time == -1) { // if no last-modified specified in HTTP header |
| time = datum.getModifiedTime(); // use value in CrawlDatum |
| if (time <= 0) { // if also unset |
| time = datum.getFetchTime(); // use time the fetch took place (fetchTime |
| // of fetchDatum) |
| } |
| } |
| |
| // un-stored, indexed and un-tokenized |
| doc.add("date", new Date(time)); |
| return doc; |
| } |
| |
| private long getTime(String date, String url) { |
| long time = -1; |
| |
| try { |
| time = HttpDateFormat.toLong(date); |
| } catch (ParseException e) { |
| // try to parse it as date in alternative format |
| try { |
| Date parsedDate = DateUtils.parseDate(date, dateStyles); |
| time = parsedDate.getTime(); |
| LOG.info(url + ": parsed date: " + date +" to: " + time); |
| } catch (Exception e2) { |
| if (LOG.isWarnEnabled()) { |
| LOG.warn(url + ": can't parse erroneous date: " + date); |
| } |
| } |
| } |
| |
| return time; |
| } |
| |
| // Add Content-Length |
| private NutchDocument addLength(NutchDocument doc, ParseData data) { |
| String contentLength = data.getMeta(Response.CONTENT_LENGTH); |
| |
| if (contentLength != null) { |
| // NUTCH-1010 ContentLength not trimmed |
| String trimmed = contentLength.trim(); |
| if (!trimmed.isEmpty()) |
| doc.add("contentLength", trimmed); |
| } |
| return doc; |
| } |
| |
| /** |
| * <p> |
| * Add Content-Type and its primaryType and subType add contentType, |
| * primaryType and subType to field "type" as un-stored, indexed and |
| * un-tokenized, so that search results can be confined by contentType or its |
| * primaryType or its subType. |
| * </p> |
| * <p> |
| * For example, if contentType is application/vnd.ms-powerpoint, search can be |
| * done with one of the following qualifiers |
| * type:application/vnd.ms-powerpoint type:application type:vnd.ms-powerpoint |
| * all case insensitive. The query filter is implemented in |
| * {@link TypeQueryFilter}. |
| * </p> |
| * |
| * @param doc |
| * @param data |
| * @param url |
| * @return |
| */ |
| private NutchDocument addType(NutchDocument doc, ParseData data, String url, |
| CrawlDatum datum) { |
| String mimeType = null; |
| String contentType = null; |
| |
| Writable tcontentType = datum.getMetaData().get( |
| new Text(Response.CONTENT_TYPE)); |
| |
| if (tcontentType != null) { |
| contentType = tcontentType.toString(); |
| } else { |
| contentType = data.getMeta(Response.CONTENT_TYPE); |
| } |
| |
| if (contentType == null) { |
| // Note by Jerome Charron on 20050415: |
| // Content Type not solved by a previous plugin |
| // Or unable to solve it... Trying to find it |
| // Should be better to use the doc content too |
| // (using MimeTypes.getMimeType(byte[], String), but I don't know |
| // which field it is? |
| // if (MAGIC) { |
| // contentType = MIME.getMimeType(url, content); |
| // } else { |
| // contentType = MIME.getMimeType(url); |
| // } |
| |
| mimeType = tika.detect(url); |
| } else { |
| mimeType = MIME.forName(MimeUtil.cleanMimeType(contentType)); |
| } |
| |
| // Checks if we solved the content-type. |
| if (mimeType == null) { |
| return doc; |
| } |
| |
| // Check if we have to map mime types |
| if (mapMimes && mimeMap.containsKey(mimeType)) { |
| if (mapFieldName != null) { |
| doc.add(mapFieldName, mimeMap.get(mimeType)); |
| } else { |
| mimeType = mimeMap.get(mimeType); |
| } |
| } |
| |
| contentType = mimeType; |
| doc.add("type", contentType); |
| |
| // Check if we need to split the content type in sub parts |
| if (conf.getBoolean("moreIndexingFilter.indexMimeTypeParts", true)) { |
| String[] parts = getParts(contentType); |
| |
| for (String part : parts) { |
| doc.add("type", part); |
| } |
| } |
| |
| // leave this for future improvement |
| // MimeTypeParameterList parameterList = mimeType.getParameters() |
| |
| return doc; |
| } |
| |
| /** |
| * Utility method for splitting mime type into type and subtype. |
| * |
| * @param mimeType |
| * @return |
| */ |
| static String[] getParts(String mimeType) { |
| return mimeType.split("/"); |
| } |
| |
| // Reset title if we see non-standard HTTP header "Content-Disposition". |
| // It's a good indication that content provider wants filename therein |
| // be used as the title of this url. |
| |
| // Patterns used to extract filename from possible non-standard |
| // HTTP header "Content-Disposition". Typically it looks like: |
| // Content-Disposition: inline; filename="foo.ppt" |
| private Configuration conf; |
| |
| static Pattern[] patterns = { null, null }; |
| |
| static { |
| try { |
| // order here is important |
| patterns[0] = Pattern.compile("\\bfilename=['\"]([^\"]+)"); |
| patterns[1] = Pattern.compile("\\bfilename=(\\S+)\\b"); |
| } catch (PatternSyntaxException e) { |
| // just ignore |
| } |
| } |
| |
| private NutchDocument resetTitle(NutchDocument doc, ParseData data) { |
| String contentDisposition = data.getMeta(Metadata.CONTENT_DISPOSITION); |
| if (contentDisposition == null || doc.getFieldValue("title") != null) |
| return doc; |
| |
| for (int i = 0; i < patterns.length; i++) { |
| Matcher matcher = patterns[i].matcher(contentDisposition); |
| if (matcher.find()) { |
| doc.add("title", matcher.group(1)); |
| break; |
| } |
| } |
| |
| return doc; |
| } |
| |
| @Override |
| public void setConf(Configuration conf) { |
| this.conf = conf; |
| MIME = new MimeUtil(conf); |
| |
| if (conf.getBoolean("moreIndexingFilter.mapMimeTypes", false)) { |
| mapMimes = true; |
| |
| mapFieldName = conf.get("moreIndexingFilter.mapMimeTypes.field"); |
| |
| // Load the mapping |
| try { |
| readConfiguration(); |
| } catch (Exception e) { |
| LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); |
| } |
| } |
| |
| URL dateStylesResource = conf.getResource("date-styles.txt"); |
| if (dateStylesResource == null) { |
| dateStyles = defaultDateStyles; |
| LOG.warn("Can't find resource: date-styles.txt - Defaults will be used."); |
| } else { |
| try { |
| List<String> usedLines = new ArrayList<String>(); |
| for (String dateStyle: FileUtils.readLines(new File(dateStylesResource.getFile()), |
| StandardCharsets.US_ASCII)) { |
| if (StringUtils.isBlank(dateStyle) || dateStyle.startsWith("#")) { |
| continue; |
| } |
| |
| usedLines.add(StringUtils.trim(dateStyle)); |
| } |
| |
| dateStyles = new String[usedLines.size()]; |
| usedLines.toArray(dateStyles); |
| } catch (IOException e) { |
| LOG.error("Failed to load resource: date-styles.txt"); |
| } |
| } |
| } |
| |
| @Override |
| public Configuration getConf() { |
| return this.conf; |
| } |
| |
| private void readConfiguration() throws IOException { |
| LOG.info("Reading content type mappings from file contenttype-mapping.txt"); |
| try (BufferedReader reader = new BufferedReader( |
| conf.getConfResourceAsReader("contenttype-mapping.txt"))) { |
| String line; |
| String[] parts; |
| boolean formatWarningShown = false; |
| |
| mimeMap = new HashMap<String, String>(); |
| |
| while ((line = reader.readLine()) != null) { |
| if (StringUtils.isBlank(line) || line.startsWith("#")) { |
| continue; |
| } |
| |
| line = line.trim(); |
| parts = line.split("\t"); |
| |
| // Must be at least two parts |
| if (parts.length > 1) { |
| for (int i = 1; i < parts.length; i++) { |
| mimeMap.put(parts[i].trim(), parts[0].trim()); |
| } |
| } else { |
| LOG.warn("Wrong format of line: {}", line); |
| if (!formatWarningShown) { |
| LOG.warn("Expected format: <target type> <tab> <type1> [<tab> <type2> ...]"); |
| formatWarningShown = true; |
| } |
| } |
| } |
| } |
| } |
| } |