| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.pig.backend.hadoop.executionengine.mapReduceLayer; |
| |
| import java.io.ByteArrayOutputStream; |
| import java.io.DataOutputStream; |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.Collections; |
| import java.util.Comparator; |
| import java.util.Iterator; |
| import java.util.List; |
| |
| import org.apache.hadoop.io.RawComparator; |
| import org.apache.hadoop.io.Writable; |
| import org.apache.hadoop.mapred.jobcontrol.Job; |
| import org.apache.hadoop.mapreduce.TaskAttemptID; |
| import org.apache.hadoop.mapreduce.Reducer.Context; |
| import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POPackage; |
| import org.apache.pig.impl.io.NullableTuple; |
| import org.apache.pig.impl.io.PigNullableWritable; |
| import org.apache.pig.impl.util.Pair; |
| import org.apache.pig.pen.FakeRawKeyValueIterator; |
| |
| public class PigMapReduce extends PigGenericMapReduce { |
| public static class Reduce extends PigGenericMapReduce.Reduce { |
| /** |
| * Get reducer's illustrator context |
| * |
| * @param input Input buffer as output by maps |
| * @param pkg package |
| * @return reducer's illustrator context |
| * @throws IOException |
| * @throws InterruptedException |
| */ |
| @Override |
| public Context getIllustratorContext(Job job, |
| List<Pair<PigNullableWritable, Writable>> input, POPackage pkg) throws IOException, InterruptedException { |
| return new IllustratorContext(job, input, pkg); |
| } |
| |
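        /**
         * A {@link Context} implementation for illustration: it replays an
         * in-memory list of map outputs, sorted with the job's sort comparator
         * and grouped with its grouping comparator, so the reduce plan can be
         * driven without a real shuffle phase.
         */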
| @SuppressWarnings("unchecked") |
| public class IllustratorContext extends Context { |
            private PigNullableWritable currentKey = null, nextKey = null; // current group key and lookahead key
            private NullableTuple nextValue = null;           // lookahead value paired with nextKey
            private List<NullableTuple> currentValues = null; // values of the current key group
            private Iterator<Pair<PigNullableWritable, Writable>> it; // cursor over the sorted input
            private final ByteArrayOutputStream bos;          // shared buffer for raw key comparison
            private final DataOutputStream dos;               // stream used to serialize keys into bos
            private final RawComparator sortComparator, groupingComparator;
            POPackage pack = null;                            // the POPackage operator of the reduce plan
| |
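            /**
             * Builds the context by delegating to the {@link Context} constructor
             * with a {@link FakeRawKeyValueIterator} and a dummy reporter (no real
             * task is running), then sorts the buffered map output with the job's
             * sort comparator and primes the key/value lookahead.
             */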
| public IllustratorContext(Job job, |
| List<Pair<PigNullableWritable, Writable>> input, |
| POPackage pkg |
| ) throws IOException, InterruptedException { |
| super(job.getJobConf(), new TaskAttemptID(), new FakeRawKeyValueIterator(input.iterator().hasNext()), |
| null, null, null, null, new IllustrateDummyReporter(), null, PigNullableWritable.class, NullableTuple.class); |
| bos = new ByteArrayOutputStream(); |
| dos = new DataOutputStream(bos); |
| org.apache.hadoop.mapreduce.Job nwJob = new org.apache.hadoop.mapreduce.Job(job.getJobConf()); |
| sortComparator = nwJob.getSortComparator(); |
| groupingComparator = nwJob.getGroupingComparator(); |
| |
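                // Sort the buffered map output the way the shuffle would: serialize
                // each pair of keys back to back into the shared buffer and hand the
                // two byte spans to the job's raw sort comparator.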
| Collections.sort(input, new Comparator<Pair<PigNullableWritable, Writable>>() { |
| @Override |
| public int compare(Pair<PigNullableWritable, Writable> o1, |
| Pair<PigNullableWritable, Writable> o2) { |
| try { |
| o1.first.write(dos); |
| int l1 = bos.size(); |
| o2.first.write(dos); |
| int l2 = bos.size(); |
| byte[] bytes = bos.toByteArray(); |
| bos.reset(); |
| return sortComparator.compare(bytes, 0, l1, bytes, l1, l2-l1); |
| } catch (IOException e) { |
                                throw new RuntimeException("Serialization exception in sort: " + e.getMessage(), e);
| } |
| } |
| } |
| ); |
| currentValues = new ArrayList<NullableTuple>(); |
| it = input.iterator(); |
| if (it.hasNext()) { |
| Pair<PigNullableWritable, Writable> entry = it.next(); |
| nextKey = entry.first; |
| nextValue = (NullableTuple) entry.second; |
| } |
| pack = pkg; |
| } |
| |
| @Override |
| public PigNullableWritable getCurrentKey() { |
| return currentKey; |
| } |
| |
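            /**
             * Advances to the next key group. All consecutive entries whose keys
             * are equal under the grouping comparator are collected into
             * {@code currentValues}; the first entry of the following group, if
             * any, is kept as the lookahead key/value pair.
             *
             * @return false once the input is exhausted, true otherwise
             */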
| @Override |
| public boolean nextKey() { |
                if (nextKey == null) {
                    return false;
                }
| currentKey = nextKey; |
| currentValues.clear(); |
| currentValues.add(nextValue); |
| nextKey = null; |
                while (it.hasNext()) {
| Pair<PigNullableWritable, Writable> entry = it.next(); |
                    /* Why can't raw comparison be used here, as it is in the sort
                     * above? For now the keys are compared as deserialized objects
                     * with the grouping comparator. */
                    if (groupingComparator.compare(currentKey, entry.first) == 0) {
| currentValues.add((NullableTuple)entry.second); |
| } else { |
| nextKey = entry.first; |
| nextValue = (NullableTuple) entry.second; |
| break; |
| } |
| } |
| return true; |
| } |
| |
| @Override |
| public Iterable<NullableTuple> getValues() { |
| return currentValues; |
| } |
| |
| @Override |
| public void write(PigNullableWritable k, Writable t) { |
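                // No-op: the illustrator collects reduce output elsewhere,
                // so nothing is emitted through this context.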
| } |
| |
| @Override |
| public void progress() { |
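                // No-op: there is no live task attempt to report progress to.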
| } |
| } |
| |
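        /**
         * Tells whether the given context belongs to an illustrator run.
         */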
| @Override |
| public boolean inIllustrator( |
| org.apache.hadoop.mapreduce.Reducer.Context context) { |
| return (context instanceof PigMapReduce.Reduce.IllustratorContext); |
| } |
| |
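        /**
         * Returns the POPackage operator attached to the given illustrator context.
         */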
| @Override |
| public POPackage getPack( |
| org.apache.hadoop.mapreduce.Reducer.Context context) { |
| return ((PigMapReduce.Reduce.IllustratorContext) context).pack; |
| } |
| } |
| } |