crunch-core/src/it/java/org/apache/crunch/TupleNClassCastBugIT.java - crunch - Git at Google

 /**
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.crunch;

 import static org.junit.Assert.assertEquals;

 import java.io.File;
 import java.io.IOException;
 import java.nio.charset.Charset;
 import java.util.List;

 import org.apache.crunch.impl.mr.MRPipeline;
 import org.apache.crunch.test.TemporaryPath;
 import org.apache.crunch.test.TemporaryPaths;
 import org.apache.crunch.types.PTypeFamily;
 import org.apache.crunch.types.avro.AvroTypeFamily;
 import org.apache.crunch.types.writable.WritableTypeFamily;
 import org.junit.Rule;
 import org.junit.Test;

 import com.google.common.io.Files;


 public class TupleNClassCastBugIT {
   @Rule
   public TemporaryPath tmpDir = TemporaryPaths.create();

   public static PCollection<TupleN> mapGroupDo(PCollection<String> lines, PTypeFamily ptf) {
     PTable<String, TupleN> mapped = lines.parallelDo(new MapFn<String, Pair<String, TupleN>>() {

       @Override
       public Pair<String, TupleN> map(String line) {
         String[] columns = line.split("\\t");
         String docId = columns[0];
         String docLine = columns[1];
         return Pair.of(docId, new TupleN(docId, docLine));
       }
     }, ptf.tableOf(ptf.strings(), ptf.tuples(ptf.strings(), ptf.strings())));
     return mapped.groupByKey().parallelDo(new DoFn<Pair<String, Iterable<TupleN>>, TupleN>() {
       @Override
       public void process(Pair<String, Iterable<TupleN>> input, Emitter<TupleN> tupleNEmitter) {
         for (TupleN tuple : input.second()) {
           tupleNEmitter.emit(tuple);
         }
       }
     }, ptf.tuples(ptf.strings(), ptf.strings()));
   }

   @Test
   public void testWritables() throws IOException {
     run(new MRPipeline(TupleNClassCastBugIT.class, tmpDir.getDefaultConfiguration()), WritableTypeFamily.getInstance());
   }

   @Test
   public void testAvro() throws IOException {
     run(new MRPipeline(TupleNClassCastBugIT.class, tmpDir.getDefaultConfiguration()), AvroTypeFamily.getInstance());
   }

   public void run(Pipeline pipeline, PTypeFamily typeFamily) throws IOException {
     String inputPath = tmpDir.copyResourceFileName("docs.txt");
     String outputPath = tmpDir.getFileName("output");

     PCollection<String> docLines = pipeline.readTextFile(inputPath);
     pipeline.writeTextFile(mapGroupDo(docLines, typeFamily), outputPath);
     pipeline.done();

     // *** We are not directly testing the output, we are looking for a
     // ClassCastException
     // *** which is thrown in a different thread during the reduce phase. If all
     // is well
     // *** the file will exist and have six lines. Otherwise the bug is present.
     File outputFile = new File(outputPath, "part-r-00000");
     List<String> lines = Files.readLines(outputFile, Charset.defaultCharset());
     int lineCount = 0;
     for (String line : lines) {
       lineCount++;
     }
     assertEquals(6, lineCount);
   }
 }
	/**
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.crunch;

	import static org.junit.Assert.assertEquals;

	import java.io.File;
	import java.io.IOException;
	import java.nio.charset.Charset;
	import java.util.List;

	import org.apache.crunch.impl.mr.MRPipeline;
	import org.apache.crunch.test.TemporaryPath;
	import org.apache.crunch.test.TemporaryPaths;
	import org.apache.crunch.types.PTypeFamily;
	import org.apache.crunch.types.avro.AvroTypeFamily;
	import org.apache.crunch.types.writable.WritableTypeFamily;
	import org.junit.Rule;
	import org.junit.Test;

	import com.google.common.io.Files;


	public class TupleNClassCastBugIT {
	@Rule
	public TemporaryPath tmpDir = TemporaryPaths.create();

	public static PCollection<TupleN> mapGroupDo(PCollection<String> lines, PTypeFamily ptf) {
	PTable<String, TupleN> mapped = lines.parallelDo(new MapFn<String, Pair<String, TupleN>>() {

	@Override
	public Pair<String, TupleN> map(String line) {
	String[] columns = line.split("\\t");
	String docId = columns[0];
	String docLine = columns[1];
	return Pair.of(docId, new TupleN(docId, docLine));
	}
	}, ptf.tableOf(ptf.strings(), ptf.tuples(ptf.strings(), ptf.strings())));
	return mapped.groupByKey().parallelDo(new DoFn<Pair<String, Iterable<TupleN>>, TupleN>() {
	@Override
	public void process(Pair<String, Iterable<TupleN>> input, Emitter<TupleN> tupleNEmitter) {
	for (TupleN tuple : input.second()) {
	tupleNEmitter.emit(tuple);
	}
	}
	}, ptf.tuples(ptf.strings(), ptf.strings()));
	}

	@Test
	public void testWritables() throws IOException {
	run(new MRPipeline(TupleNClassCastBugIT.class, tmpDir.getDefaultConfiguration()), WritableTypeFamily.getInstance());
	}

	@Test
	public void testAvro() throws IOException {
	run(new MRPipeline(TupleNClassCastBugIT.class, tmpDir.getDefaultConfiguration()), AvroTypeFamily.getInstance());
	}

	public void run(Pipeline pipeline, PTypeFamily typeFamily) throws IOException {
	String inputPath = tmpDir.copyResourceFileName("docs.txt");
	String outputPath = tmpDir.getFileName("output");

	PCollection<String> docLines = pipeline.readTextFile(inputPath);
	pipeline.writeTextFile(mapGroupDo(docLines, typeFamily), outputPath);
	pipeline.done();

	// *** We are not directly testing the output, we are looking for a
	// ClassCastException
	// *** which is thrown in a different thread during the reduce phase. If all
	// is well
	// *** the file will exist and have six lines. Otherwise the bug is present.
	File outputFile = new File(outputPath, "part-r-00000");
	List<String> lines = Files.readLines(outputFile, Charset.defaultCharset());
	int lineCount = 0;
	for (String line : lines) {
	lineCount++;
	}
	assertEquals(6, lineCount);
	}
	}