| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.pig.test; |
| |
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;

import junit.framework.Assert;

import org.apache.pig.EvalFunc;
import org.apache.pig.ExecType;
import org.apache.pig.PigServer;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.builtin.PigStorage;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.DefaultDataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.io.FileLocalizer;
import org.apache.pig.impl.util.Utils;
import org.apache.pig.newplan.logical.relational.LogicalSchema;
import org.apache.pig.newplan.logical.relational.LogicalSchema.LogicalFieldSchema;
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.Test;
| |
| public class TestCombiner { |
| |
| static MiniCluster cluster = MiniCluster.buildCluster(); |
| |
| @AfterClass |
| public static void oneTimeTearDown() throws Exception { |
| cluster.shutDown(); |
| } |
| |
| |
| @Test |
| public void testSuccessiveUserFuncs1() throws Exception{ |
| String query = "a = load 'students.txt' as (c1,c2,c3,c4); " + |
| "c = group a by c2; " + |
| "f = foreach c generate COUNT(org.apache.pig.builtin.Distinct($1.$2)); " + |
| "store f into 'out';"; |
| PigServer pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties()); |
| PigContext pc = pigServer.getPigContext(); |
| assertTrue((Util.buildMRPlan(Util.buildPp(pigServer,query),pc).getRoots().get(0).combinePlan.isEmpty())); |
| } |
| |
| @Test |
| public void testSuccessiveUserFuncs2() throws Exception { |
| String dummyUDF = JiraPig1030.class.getName(); |
| String query = "a = load 'students.txt' as (c1,c2,c3,c4); " + |
| "c = group a by c2; " + |
| "f = foreach c generate COUNT(" + dummyUDF + "" + |
| "(org.apache.pig.builtin.Distinct($1.$2),"+dummyUDF+"())); " + |
| "store f into 'out';"; |
| PigServer pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties()); |
| PigContext pc = pigServer.getPigContext(); |
| assertTrue((Util.buildMRPlan(Util.buildPp(pigServer,query),pc).getRoots().get(0).combinePlan.isEmpty())); |
| } |
| |
| @Test |
| public void testOnCluster() throws Exception { |
| // run the test on cluster |
| String inputFileName = runTest(new PigServer( |
| ExecType.MAPREDUCE, cluster.getProperties())); |
| Util.deleteFile(cluster, inputFileName); |
| |
| } |
| |
| /* (non-Javadoc) |
| * @see junit.framework.TestCase#setUp() |
| */ |
| protected void setUp() throws Exception { |
| // cause a re initialization of FileLocalizer's |
| // internal state before each test run |
| // A previous test might have been in a different |
| // mode than the test which is about to run. To |
| // ensure each test runs correctly in it's exectype |
| // mode, let's re initialize. |
| FileLocalizer.setInitialized(false); |
| } |
| |
| @Test |
| public void testLocal() throws Exception { |
| // run the test locally |
| FileLocalizer.deleteTempFiles(); |
| runTest(new PigServer(ExecType.LOCAL, new Properties())); |
| FileLocalizer.deleteTempFiles(); |
| } |
| |
| |
| private String runTest(PigServer pig) throws IOException { |
| List<String> inputLines = new ArrayList<String>(); |
| inputLines.add("a,b,1"); |
| inputLines.add("a,b,1"); |
| inputLines.add("a,c,1"); |
| String inputFileName = loadWithTestLoadFunc("A", pig, inputLines); |
| |
| pig.registerQuery("B = group A by ($0, $1);"); |
| pig.registerQuery("C = foreach B generate flatten(group), COUNT($1);"); |
| Iterator<Tuple> resultIterator = pig.openIterator("C"); |
| Tuple tuple = resultIterator.next(); |
| assertEquals("(a,b,2)", tuple.toString()); |
| tuple = resultIterator.next(); |
| assertEquals("(a,c,1)", tuple.toString()); |
| |
| return inputFileName; |
| } |
| |
| private String loadWithTestLoadFunc(String loadAlias, PigServer pig, |
| List<String> inputLines) throws IOException { |
| File inputFile = File.createTempFile("test", "txt"); |
| inputFile.deleteOnExit(); |
| String inputFileName = inputFile.getAbsolutePath(); |
| if(pig.getPigContext().getExecType() == ExecType.LOCAL) { |
| PrintStream ps = new PrintStream(new FileOutputStream(inputFile)); |
| for (String line : inputLines) { |
| ps.println(line); |
| } |
| ps.close(); |
| } else { |
| inputFileName = Util.removeColon(inputFileName); |
| Util.createInputFile(cluster, inputFileName, inputLines.toArray(new String[] {})); |
| } |
| pig.registerQuery(loadAlias + " = load '" |
| + Util.encodeEscape(inputFileName) + "' using " |
| + PigStorage.class.getName() + "(',');"); |
| return inputFileName; |
| } |
| |
| @Test |
| public void testNoCombinerUse() { |
| // To simulate this, we will have two input files |
| // with exactly one input record - this should result |
| // in two map tasks and each would process only one record |
| // hence the combiner would not get called. |
| } |
| |
| @Test |
| public void testMultiCombinerUse() throws Exception { |
| // test the scenario where the combiner is called multiple |
| // times - this can happen when the output of the map > io.sort.mb |
| // let's set the io.sort.mb to 1MB and > 1 MB map data. |
| String[] input = new String[500*1024]; |
| for(int i = 0; i < input.length; i++) { |
| if(i % 2 == 0) { |
| input[i] = Integer.toString(1); |
| } else { |
| input[i] = Integer.toString(0); |
| } |
| } |
| Util.createInputFile(cluster, "MultiCombinerUseInput.txt", input); |
| Properties props = cluster.getProperties(); |
| props.setProperty("io.sort.mb", "1"); |
| PigServer pigServer = new PigServer(ExecType.MAPREDUCE, props); |
| pigServer.registerQuery("a = load 'MultiCombinerUseInput.txt' as (x:int);"); |
| pigServer.registerQuery("b = group a all;"); |
| pigServer.registerQuery("c = foreach b generate COUNT(a), SUM(a.$0), " + |
| "MIN(a.$0), MAX(a.$0), AVG(a.$0), ((double)SUM(a.$0))/COUNT(a.$0)," + |
| " COUNT(a.$0) + SUM(a.$0) + MAX(a.$0);"); |
| |
| // make sure there is a combine plan in the explain output |
| ByteArrayOutputStream baos = new ByteArrayOutputStream(); |
| PrintStream ps = new PrintStream(baos); |
| pigServer.explain("c", ps); |
| assertTrue(baos.toString().matches("(?si).*combine plan.*")); |
| |
| Iterator<Tuple> it = pigServer.openIterator("c"); |
| Tuple t = it.next(); |
| assertEquals(512000L, t.get(0)); |
| assertEquals(256000L, t.get(1)); |
| assertEquals(0, t.get(2)); |
| assertEquals(1, t.get(3)); |
| assertEquals(0.5, t.get(4)); |
| assertEquals(0.5, t.get(5)); |
| assertEquals(512000L + 256000L + 1, t.get(6)); |
| |
| assertFalse(it.hasNext()); |
| Util.deleteFile(cluster, "MultiCombinerUseInput.txt"); |
| } |
| |
| @Test |
| public void testDistinctAggs1() throws Exception { |
| // test the use of combiner for distinct aggs: |
| String input[] = { |
| "pig1\t18\t2.1", |
| "pig2\t24\t3.3", |
| "pig5\t45\t2.4", |
| "pig1\t18\t2.1", |
| "pig1\t19\t2.1", |
| "pig2\t24\t4.5", |
| "pig1\t20\t3.1" }; |
| |
| Util.createInputFile(cluster, "distinctAggs1Input.txt", input); |
| PigServer pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties()); |
| pigServer.registerQuery("a = load 'distinctAggs1Input.txt' as (name:chararray, age:int, gpa:double);"); |
| pigServer.registerQuery("b = group a by name;"); |
| pigServer.registerQuery("c = foreach b {" + |
| " x = distinct a.age;" + |
| " y = distinct a.gpa;" + |
| " z = distinct a;" + |
| " generate group, COUNT(x), SUM(x.age), SUM(y.gpa), SUM(a.age), " + |
| " SUM(a.gpa), COUNT(z.age), COUNT(z), SUM(z.age);};"); |
| |
| // make sure there is a combine plan in the explain output |
| ByteArrayOutputStream baos = new ByteArrayOutputStream(); |
| PrintStream ps = new PrintStream(baos); |
| pigServer.explain("c", ps); |
| assertTrue(baos.toString().matches("(?si).*combine plan.*")); |
| |
| HashMap<String, Object[]> results = new HashMap<String, Object[]>(); |
| results.put("pig1", new Object[] {"pig1",3L,57L,5.2,75L,9.4,3L,3L,57L}); |
| results.put("pig2", new Object[] {"pig2",1L,24L,7.8,48L,7.8,2L,2L,48L}); |
| results.put("pig5", new Object[] {"pig5",1L,45L,2.4,45L,2.4,1L,1L,45L}); |
| Iterator<Tuple> it = pigServer.openIterator("c"); |
| while(it.hasNext()) { |
| Tuple t = it.next(); |
| List<Object> fields = t.getAll(); |
| Object[] expected = results.get((String)fields.get(0)); |
| int i = 0; |
| for (Object field : fields) { |
| assertEquals(expected[i++], field); |
| } |
| } |
| Util.deleteFile(cluster, "distinctAggs1Input.txt"); |
| |
| } |
| |
| @Test |
| public void testGroupElements() throws Exception { |
| // test use of combiner when group elements are accessed in the foreach |
| String input[] = { |
| "ABC\t1\ta\t1", |
| "ABC\t1\tb\t2", |
| "ABC\t1\ta\t3", |
| "ABC\t2\tb\t4", |
| "DEF\t1\td\t1", |
| "XYZ\t1\tx\t2" |
| }; |
| |
| Util.createInputFile(cluster, "testGroupElements.txt", input); |
| PigServer pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties()); |
| pigServer.registerQuery("a = load 'testGroupElements.txt' as (str:chararray, num1:int, alph : chararray, num2 : int);"); |
| pigServer.registerQuery("b = group a by (str, num1);"); |
| |
| //check if combiner is present or not for various forms of foreach |
| pigServer.registerQuery("c = foreach b generate flatten(group), COUNT(a.alph), SUM(a.num2); "); |
| checkCombinerUsed(pigServer, "c", true); |
| |
| pigServer.registerQuery("c = foreach b generate group, COUNT(a.alph), SUM(a.num2); "); |
| checkCombinerUsed(pigServer, "c", true); |
| |
| // projecting bag - combiner should not be used |
| pigServer.registerQuery("c = foreach b generate group, a, COUNT(a.alph), SUM(a.num2); "); |
| checkCombinerUsed(pigServer, "c", false); |
| |
| // projecting bag - combiner should not be used |
| pigServer.registerQuery("c = foreach b generate group, a.num2, COUNT(a.alph), SUM(a.num2); "); |
| checkCombinerUsed(pigServer, "c", false); |
| |
| pigServer.registerQuery("c = foreach b generate group.$0, group.$1, COUNT(a.alph), SUM(a.num2); "); |
| checkCombinerUsed(pigServer, "c", true); |
| |
| pigServer.registerQuery("c = foreach b generate group.$0, group.$1 + COUNT(a.alph), SUM(a.num2); "); |
| checkCombinerUsed(pigServer, "c", true); |
| |
| pigServer.registerQuery("c = foreach b generate group.str, group.$1, COUNT(a.alph), SUM(a.num2); "); |
| checkCombinerUsed(pigServer, "c", true); |
| |
| pigServer.registerQuery("c = foreach b generate group.str, group.$1, COUNT(a.alph), SUM(a.num2), " + |
| " (group.num1 == 1 ? (COUNT(a.num2) + 1) : (SUM(a.num2) + 10)) ; "); |
| checkCombinerUsed(pigServer, "c", true); |
| |
| List<Tuple> expectedRes = |
| Util.getTuplesFromConstantTupleStrings( |
| new String[] { |
| "('ABC',1,3L,6L,4L)", |
| "('ABC',2,1L,4L,14L)", |
| "('DEF',1,1L,1L,2L)", |
| "('XYZ',1,1L,2L,2L)", |
| }); |
| |
| Iterator<Tuple> it = pigServer.openIterator("c"); |
| Util.checkQueryOutputsAfterSort(it, expectedRes); |
| |
| Util.deleteFile(cluster, "distinctAggs1Input.txt"); |
| |
| } |
| |
| @Test |
| public void testGroupByLimit() throws Exception { |
| // test use of combiner when group elements are accessed in the foreach |
| String input[] = { |
| "ABC 1", |
| "ABC 2", |
| "DEF 1", |
| "XYZ 1", |
| "XYZ 2", |
| "XYZ 3", |
| }; |
| |
| Util.createInputFile(cluster, "testGroupLimit.txt", input); |
| PigServer pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties()); |
| pigServer.registerQuery("a = load 'testGroupLimit.txt' using PigStorage(' ') " + |
| "as (str:chararray, num1:int) ;"); |
| pigServer.registerQuery("b = group a by str;"); |
| |
| |
| pigServer.registerQuery("c = foreach b generate group, COUNT(a.num1) ; "); |
| |
| //check if combiner is present |
| pigServer.registerQuery("d = limit c 2 ; "); |
| checkCombinerUsed(pigServer, "d", true); |
| |
| List<Tuple> expectedRes = |
| Util.getTuplesFromConstantTupleStrings( |
| new String[] { |
| "('ABC',2L)", |
| "('DEF',1L)", |
| }); |
| |
| Iterator<Tuple> it = pigServer.openIterator("d"); |
| Util.checkQueryOutputsAfterSort(it, expectedRes); |
| |
| |
| } |
| |
| private void checkCombinerUsed(PigServer pigServer, String string, boolean combineExpected) |
| throws IOException { |
| // make sure there is a combine plan in the explain output |
| ByteArrayOutputStream baos = new ByteArrayOutputStream(); |
| PrintStream ps = new PrintStream(baos); |
| pigServer.explain("c", ps); |
| boolean combinerFound = baos.toString().matches("(?si).*combine plan.*"); |
| System.out.println(baos.toString()); |
| assertEquals("is combiner present as expected", combineExpected, combinerFound); |
| } |
| |
| |
| @Test |
| public void testDistinctNoCombiner() throws Exception { |
| // test that combiner is NOT invoked when |
| // one of the elements in the foreach generate |
| // is a distinct() as the leaf |
| String input[] = { |
| "pig1\t18\t2.1", |
| "pig2\t24\t3.3", |
| "pig5\t45\t2.4", |
| "pig1\t18\t2.1", |
| "pig1\t19\t2.1", |
| "pig2\t24\t4.5", |
| "pig1\t20\t3.1" }; |
| |
| Util.createInputFile(cluster, "distinctNoCombinerInput.txt", input); |
| PigServer pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties()); |
| pigServer.registerQuery("a = load 'distinctNoCombinerInput.txt' as (name:chararray, age:int, gpa:double);"); |
| pigServer.registerQuery("b = group a by name;"); |
| pigServer.registerQuery("c = foreach b {" + |
| " z = distinct a;" + |
| " generate group, z, SUM(a.age), SUM(a.gpa);};"); |
| |
| // make sure there is a combine plan in the explain output |
| ByteArrayOutputStream baos = new ByteArrayOutputStream(); |
| PrintStream ps = new PrintStream(baos); |
| pigServer.explain("c", ps); |
| assertFalse(baos.toString().matches("(?si).*combine plan.*")); |
| |
| HashMap<String, Object[]> results = new HashMap<String, Object[]>(); |
| results.put("pig1", new Object[] {"pig1","bag-place-holder",75L,9.4}); |
| results.put("pig2", new Object[] {"pig2","bag-place-holder",48L,7.8}); |
| results.put("pig5", new Object[] {"pig5","bag-place-holder",45L,2.4}); |
| Iterator<Tuple> it = pigServer.openIterator("c"); |
| while(it.hasNext()) { |
| Tuple t = it.next(); |
| List<Object> fields = t.getAll(); |
| Object[] expected = results.get((String)fields.get(0)); |
| int i = 0; |
| for (Object field : fields) { |
| if(i == 1) { |
| // ignore the second field which is a bag |
| // for comparison here |
| continue; |
| } |
| assertEquals(expected[i++], field); |
| } |
| } |
| Util.deleteFile(cluster, "distinctNoCombinerInput.txt"); |
| |
| } |
| |
| @Test |
| public void testForEachNoCombiner() throws Exception { |
| // test that combiner is NOT invoked when |
| // one of the elements in the foreach generate |
| // has a foreach in the plan without a distinct agg |
| String input[] = { |
| "pig1\t18\t2.1", |
| "pig2\t24\t3.3", |
| "pig5\t45\t2.4", |
| "pig1\t18\t2.1", |
| "pig1\t19\t2.1", |
| "pig2\t24\t4.5", |
| "pig1\t20\t3.1" }; |
| |
| Util.createInputFile(cluster, "forEachNoCombinerInput.txt", input); |
| PigServer pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties()); |
| pigServer.registerQuery("a = load 'forEachNoCombinerInput.txt' as (name:chararray, age:int, gpa:double);"); |
| pigServer.registerQuery("b = group a by name;"); |
| pigServer.registerQuery("c = foreach b {" + |
| " z = a.age;" + |
| " generate group, z, SUM(a.age), SUM(a.gpa);};"); |
| |
| // make sure there is a combine plan in the explain output |
| ByteArrayOutputStream baos = new ByteArrayOutputStream(); |
| PrintStream ps = new PrintStream(baos); |
| pigServer.explain("c", ps); |
| assertFalse(baos.toString().matches("(?si).*combine plan.*")); |
| |
| HashMap<String, Object[]> results = new HashMap<String, Object[]>(); |
| results.put("pig1", new Object[] {"pig1","bag-place-holder",75L,9.4}); |
| results.put("pig2", new Object[] {"pig2","bag-place-holder",48L,7.8}); |
| results.put("pig5", new Object[] {"pig5","bag-place-holder",45L,2.4}); |
| Iterator<Tuple> it = pigServer.openIterator("c"); |
| while(it.hasNext()) { |
| Tuple t = it.next(); |
| List<Object> fields = t.getAll(); |
| Object[] expected = results.get((String)fields.get(0)); |
| int i = 0; |
| for (Object field : fields) { |
| if(i == 1) { |
| // ignore the second field which is a bag |
| // for comparison here |
| continue; |
| } |
| assertEquals(expected[i++], field); |
| } |
| } |
| Util.deleteFile(cluster, "forEachNoCombinerInput.txt"); |
| |
| } |
| |
| @Test |
| public void testJiraPig746() { |
| // test that combiner is NOT invoked when |
| // one of the elements in the foreach generate |
| // has a foreach in the plan without a distinct agg |
| String input[] = { |
| "pig1\t18\t2.1", |
| "pig2\t24\t3.3", |
| "pig5\t45\t2.4", |
| "pig1\t18\t2.1", |
| "pig1\t19\t2.1", |
| "pig2\t24\t4.5", |
| "pig1\t20\t3.1" }; |
| |
| String expected[] = { |
| "(pig1,75,{(pig1,18,2.1),(pig1,18,2.1),(pig1,19,2.1),(pig1,20,3.1)})", |
| "(pig2,48,{(pig2,24,3.3),(pig2,24,4.5)})", |
| "(pig5,45,{(pig5,45,2.4)})" |
| }; |
| |
| try { |
| Util.createInputFile(cluster, "forEachNoCombinerInput.txt", input); |
| |
| PigServer pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties()); |
| pigServer.registerQuery("a = load 'forEachNoCombinerInput.txt' as (name:chararray, age:int, gpa:double);"); |
| pigServer.registerQuery("b = group a by name;"); |
| pigServer.registerQuery("c = foreach b generate group, SUM(a.age), a;"); |
| |
| // make sure there isn't a combine plan in the explain output |
| ByteArrayOutputStream baos = new ByteArrayOutputStream(); |
| PrintStream ps = new PrintStream(baos); |
| pigServer.explain("c", ps); |
| assertFalse(baos.toString().matches("(?si).*combine plan.*")); |
| |
| Iterator<Tuple> it = pigServer.openIterator("c"); |
| Util.checkQueryOutputsAfterSortRecursive(it, expected, "group:chararray,age:long,b:{t:(name:chararray,age:int,gpa:double)}"); |
| } catch (IOException e) { |
| e.printStackTrace(); |
| Assert.fail(); |
| } finally { |
| try { |
| Util.deleteFile(cluster, "forEachNoCombinerInput.txt"); |
| } catch (IOException e) { |
| e.printStackTrace(); |
| Assert.fail(); |
| } |
| } |
| } |
| |
| public static class JiraPig1030 extends EvalFunc<DataBag> { |
| |
| public DataBag exec(Tuple input) throws IOException { |
| return new DefaultDataBag(); |
| } |
| } |
| |
| @Test |
| public void testJiraPig1030() { |
| // test that combiner is NOT invoked when |
| // one of the elements in the foreach generate |
| // has a non-algebraic UDF that have multiple inputs |
| // (one of them is distinct). |
| |
| String input[] = { |
| "pig1\t18\t2.1", |
| "pig2\t24\t3.3", |
| "pig5\t45\t2.4", |
| "pig1\t18\t2.1", |
| "pig1\t19\t2.1", |
| "pig2\t24\t4.5", |
| "pig1\t20\t3.1" }; |
| |
| try { |
| Util.createInputFile(cluster, "forEachNoCombinerInput.txt", input); |
| PigServer pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties()); |
| pigServer.registerQuery("a = load 'forEachNoCombinerInput.txt' as (name:chararray, age:int, gpa:double);"); |
| pigServer.registerQuery("b = group a all;"); |
| pigServer.registerQuery("c = foreach b {" + |
| " d = distinct a.age;" + |
| " generate group, " + JiraPig1030.class.getName() + "(d, 0);};"); |
| |
| // make sure there isn't a combine plan in the explain output |
| ByteArrayOutputStream baos = new ByteArrayOutputStream(); |
| PrintStream ps = new PrintStream(baos); |
| pigServer.explain("c", ps); |
| assertFalse(baos.toString().matches("(?si).*combine plan.*")); |
| } catch (Exception e) { |
| e.printStackTrace(); |
| Assert.fail(); |
| } finally { |
| try { |
| Util.deleteFile(cluster, "forEachNoCombinerInput.txt"); |
| } catch (IOException e) { |
| e.printStackTrace(); |
| Assert.fail(); |
| } |
| } |
| } |
| |
| } |