blob: 52d51f92f633c6907b8a54dd9ad26a215bd55b10 [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.mapred;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.security.DigestException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.ReflectionUtils;
/**
* A class that allows a map/red job to work on a sample of sequence files.
* The sample is decided by the filter class set by the job.
*
*/
public class SequenceFileInputFilter<K, V>
extends SequenceFileInputFormat<K, V> {
final private static String FILTER_CLASS = "sequencefile.filter.class";
final private static String FILTER_FREQUENCY
= "sequencefile.filter.frequency";
final private static String FILTER_REGEX = "sequencefile.filter.regex";
public SequenceFileInputFilter() {
}
/**
 * Creates a record reader that returns only the records of the split
 * accepted by the configured {@link Filter}.
 *
 * @param split file split to read
 * @param job job configuration
 * @param reporter reporter that sends progress to the task tracker
 * @return a filtering {@link RecordReader} over the split
 * @throws IOException if the underlying sequence-file reader cannot be opened
 */
public RecordReader<K, V> getRecordReader(InputSplit split,
                                          JobConf job, Reporter reporter)
    throws IOException {
  final FileSplit fileSplit = (FileSplit) split;
  reporter.setStatus(fileSplit.toString());
  return new FilterRecordReader<K, V>(job, fileSplit);
}
/**
 * Sets the filter class to be used by filtering record readers.
 *
 * @param conf application configuration
 * @param filterClass filter class; expected to implement {@link Filter}
 */
public static void setFilterClass(Configuration conf, Class<?> filterClass) {
  // Class<?> instead of the raw type Class: accepts every argument the raw
  // version did, without emitting an unchecked-usage warning at call sites.
  conf.set(FILTER_CLASS, filterClass.getName());
}
/**
 * Interface implemented by record filters. An implementation decides,
 * key by key, whether a record belongs to the sample.
 */
public interface Filter extends Configurable {
  /**
   * Decides whether a record should be included in the sample.
   *
   * @param key record key
   * @return true if the record is accepted, false otherwise
   */
  boolean accept(Object key);
}
/**
 * Convenience base class for {@link Filter} implementations that stores
 * the {@link Configuration} handed to {@code setConf} and exposes it via
 * {@link #getConf()}; subclasses implement {@code setConf} and assign
 * this field themselves.
 */
public static abstract class FilterBase implements Filter {
// package-private so the nested subclasses in this file can assign it directly
Configuration conf;
/** @return the configuration last assigned by a subclass's setConf */
public Configuration getConf() {
return conf;
}
}
/**
 * A {@link Filter} that accepts records whose key, converted to a string,
 * matches a configured regular expression.
 */
public static class RegexFilter extends FilterBase {
  private Pattern p;

  /**
   * Validates the filtering regex and stores it in the configuration.
   *
   * @param conf configuration in which the regex is stored
   * @param regex regex used as a filter
   * @throws IllegalArgumentException if the regex does not compile
   */
  public static void setPattern(Configuration conf, String regex)
      throws PatternSyntaxException {
    try {
      Pattern.compile(regex);
    } catch (PatternSyntaxException e) {
      // preserve the cause so the underlying syntax error is not lost
      throw new IllegalArgumentException("Invalid pattern: " + regex, e);
    }
    conf.set(FILTER_REGEX, regex);
  }

  public RegexFilter() { }

  /**
   * Configures the filter by reading and compiling the regex from conf.
   *
   * @throws RuntimeException if no regex has been set
   */
  public void setConf(Configuration conf) {
    String regex = conf.get(FILTER_REGEX);
    if (regex == null) {
      // message previously read "...regexnot set" — missing space fixed
      throw new RuntimeException(FILTER_REGEX + " not set");
    }
    this.p = Pattern.compile(regex);
    this.conf = conf;
  }

  /** Filtering method
   * If key matches the regex, return true; otherwise return false
   * @see org.apache.hadoop.mapred.SequenceFileInputFilter.Filter#accept(Object)
   */
  public boolean accept(Object key) {
    return p.matcher(key.toString()).matches();
  }
}
/** This class returns a percentage of records
 * The percentage is determined by a filtering frequency <i>f</i> using
 * the criteria record# % f == 0.
 * For example, if the frequency is 10, one out of 10 records is returned.
 */
public static class PercentFilter extends FilterBase {
  private int frequency;
  // records seen since the last accepted one; reset when it reaches frequency
  private int count;

  /** set the frequency and stores it in conf
   * @param conf configuration
   * @param frequency filtering frequency; must be positive
   * @throws IllegalArgumentException if frequency is zero or negative
   */
  public static void setFrequency(Configuration conf, int frequency) {
    if (frequency <= 0) {
      // previous message said "Negative", which was wrong for frequency == 0
      throw new IllegalArgumentException(
          FILTER_FREQUENCY + " must be positive: " + frequency);
    }
    conf.setInt(FILTER_FREQUENCY, frequency);
  }

  public PercentFilter() { }

  /** configure the filter by checking the configuration
   *
   * @param conf configuration
   * @throws RuntimeException if the stored frequency is zero or negative
   */
  public void setConf(Configuration conf) {
    // use the shared constant instead of repeating the literal key
    this.frequency = conf.getInt(FILTER_FREQUENCY, 10);
    if (this.frequency <= 0) {
      throw new RuntimeException(
          FILTER_FREQUENCY + " must be positive: " + this.frequency);
    }
    this.conf = conf;
  }

  /** Filtering method
   * If record# % frequency==0, return true; otherwise return false.
   * Stateful: counts records across calls, so an instance must not be
   * shared by concurrent readers.
   * @see org.apache.hadoop.mapred.SequenceFileInputFilter.Filter#accept(Object)
   */
  public boolean accept(Object key) {
    boolean accepted = false;
    if (count == 0) {
      accepted = true;
    }
    if (++count == frequency) {
      count = 0;
    }
    return accepted;
  }
}
/** This class returns a set of records by examing the MD5 digest of its
 * key against a filtering frequency <i>f</i>. The filtering criteria is
 * MD5(key) % f == 0.
 */
public static class MD5Filter extends FilterBase {
  private int frequency;
  private static final MessageDigest DIGESTER;
  public static final int MD5_LEN = 16;
  private byte[] digest = new byte[MD5_LEN];

  static {
    try {
      DIGESTER = MessageDigest.getInstance("MD5");
    } catch (NoSuchAlgorithmException e) {
      // MD5 is a required platform algorithm, so this should never happen
      throw new RuntimeException(e);
    }
  }

  /** set the filtering frequency in configuration
   *
   * @param conf configuration
   * @param frequency filtering frequency; must be positive
   * @throws IllegalArgumentException if frequency is zero or negative
   */
  public static void setFrequency(Configuration conf, int frequency) {
    if (frequency <= 0) {
      // previous message said "Negative", which was wrong for frequency == 0
      throw new IllegalArgumentException(
          FILTER_FREQUENCY + " must be positive: " + frequency);
    }
    conf.setInt(FILTER_FREQUENCY, frequency);
  }

  public MD5Filter() { }

  /** configure the filter according to configuration
   *
   * @param conf configuration
   * @throws RuntimeException if the stored frequency is zero or negative
   */
  public void setConf(Configuration conf) {
    this.frequency = conf.getInt(FILTER_FREQUENCY, 10);
    if (this.frequency <= 0) {
      throw new RuntimeException(
          FILTER_FREQUENCY + " must be positive: " + this.frequency);
    }
    this.conf = conf;
  }

  /** Filtering method
   * If MD5(key) % frequency==0, return true; otherwise return false
   * @see org.apache.hadoop.mapred.SequenceFileInputFilter.Filter#accept(Object)
   */
  public boolean accept(Object key) {
    try {
      long hashcode;
      if (key instanceof Text) {
        hashcode = MD5Hashcode((Text) key);
      } else if (key instanceof BytesWritable) {
        hashcode = MD5Hashcode((BytesWritable) key);
      } else {
        // fall back to hashing the UTF-8 encoding of key.toString()
        ByteBuffer bb = Text.encode(key.toString());
        hashcode = MD5Hashcode(bb.array(), 0, bb.limit());
      }
      // equivalent to the original hashcode/frequency*frequency==hashcode
      // check (Java integer division truncates toward zero for both signs)
      return hashcode % frequency == 0;
    } catch (Exception e) {
      LOG.warn(e);
      throw new RuntimeException(e);
    }
  }

  private long MD5Hashcode(Text key) throws DigestException {
    return MD5Hashcode(key.getBytes(), 0, key.getLength());
  }

  private long MD5Hashcode(BytesWritable key) throws DigestException {
    return MD5Hashcode(key.getBytes(), 0, key.getLength());
  }

  private long MD5Hashcode(byte[] bytes, int start, int length)
      throws DigestException {
    // Lock the shared static digester, not this instance: the previous
    // per-instance `synchronized` did not stop two MD5Filter instances
    // from interleaving update/digest calls on DIGESTER.
    synchronized (DIGESTER) {
      // was update(bytes, 0, length): the start offset was silently
      // ignored (all current callers pass 0, so behavior is unchanged)
      DIGESTER.update(bytes, start, length);
      DIGESTER.digest(digest, 0, MD5_LEN);
      long hashcode = 0;
      // fold the first 8 digest bytes into a big-endian long
      for (int i = 0; i < 8; i++) {
        hashcode |= ((digest[i] & 0xffL) << (8 * (7 - i)));
      }
      return hashcode;
    }
  }
}
/**
 * A {@link SequenceFileRecordReader} that returns only the records
 * accepted by the filter class named in the configuration (defaulting
 * to {@link PercentFilter}).
 */
private static class FilterRecordReader<K, V>
    extends SequenceFileRecordReader<K, V> {

  private Filter filter;

  public FilterRecordReader(Configuration conf, FileSplit split)
      throws IOException {
    super(conf, split);
    // instantiate the configured filter via reflection
    filter = (Filter) ReflectionUtils.newInstance(
        conf.getClass(FILTER_CLASS, PercentFilter.class), conf);
  }

  public synchronized boolean next(K key, V value) throws IOException {
    // advance until the filter accepts a key or the input is exhausted
    for (boolean more = next(key); more; more = next(key)) {
      if (!filter.accept(key)) {
        continue;
      }
      getCurrentValue(value);
      return true;
    }
    return false;
  }
}
}