| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.flink.test.operators; |
| |
| import org.apache.flink.api.common.distributions.DataDistribution; |
| import org.apache.flink.api.common.functions.RichMapPartitionFunction; |
| import org.apache.flink.api.common.typeinfo.BasicTypeInfo; |
| import org.apache.flink.api.common.typeinfo.TypeInformation; |
| import org.apache.flink.api.java.DataSet; |
| import org.apache.flink.api.java.ExecutionEnvironment; |
| import org.apache.flink.api.java.io.DiscardingOutputFormat; |
| import org.apache.flink.api.java.tuple.Tuple3; |
| import org.apache.flink.api.java.utils.DataSetUtils; |
| import org.apache.flink.core.memory.DataInputView; |
| import org.apache.flink.core.memory.DataOutputView; |
| import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; |
| import org.apache.flink.test.operators.util.CollectionDataSets; |
| import org.apache.flink.test.util.MiniClusterWithClientResource; |
| import org.apache.flink.testutils.junit.category.AlsoRunWithSchedulerNG; |
| import org.apache.flink.util.Collector; |
| import org.apache.flink.util.TestLogger; |
| |
| import org.junit.ClassRule; |
| import org.junit.Test; |
| import org.junit.experimental.categories.Category; |
| |
| import java.io.IOException; |
| |
| import static org.junit.Assert.fail; |
| |
| /** |
| * Integration tests for custom {@link DataDistribution}. |
| */ |
| @SuppressWarnings("serial") |
| @Category(AlsoRunWithSchedulerNG.class) |
| public class CustomDistributionITCase extends TestLogger { |
| |
	// Shared mini-cluster for all tests in this class: one TaskManager with 8 slots,
	// enough to cover the parallelism of both test distributions (4 and 5 buckets).
	@ClassRule
	public static final MiniClusterWithClientResource MINI_CLUSTER_RESOURCE = new MiniClusterWithClientResource(
		new MiniClusterResourceConfiguration.Builder()
			.setNumberTaskManagers(1)
			.setNumberSlotsPerTaskManager(8)
			.build());
| |
| // ------------------------------------------------------------------------ |
| |
| /** |
| * Test the record partitioned rightly with one field according to the customized data distribution. |
| */ |
| @Test |
| public void testPartitionWithDistribution1() throws Exception { |
| final TestDataDist1 dist = new TestDataDist1(); |
| |
| final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); |
| env.setParallelism(dist.getParallelism()); |
| |
| DataSet<Tuple3<Integer, Long, String>> input = CollectionDataSets.get3TupleDataSet(env); |
| |
| DataSet<Boolean> result = DataSetUtils |
| .partitionByRange(input, dist, 0) |
| .mapPartition(new RichMapPartitionFunction<Tuple3<Integer, Long, String>, Boolean>() { |
| |
| @Override |
| public void mapPartition(Iterable<Tuple3<Integer, Long, String>> values, Collector<Boolean> out) throws Exception { |
| int pIdx = getRuntimeContext().getIndexOfThisSubtask(); |
| |
| for (Tuple3<Integer, Long, String> s : values) { |
| boolean correctlyPartitioned = true; |
| if (pIdx == 0) { |
| Integer[] upper = dist.boundaries[0]; |
| if (s.f0.compareTo(upper[0]) > 0) { |
| correctlyPartitioned = false; |
| } |
| } |
| else if (pIdx > 0 && pIdx < dist.getParallelism() - 1) { |
| Integer[] lower = dist.boundaries[pIdx - 1]; |
| Integer[] upper = dist.boundaries[pIdx]; |
| if (s.f0.compareTo(upper[0]) > 0 || (s.f0.compareTo(lower[0]) <= 0)) { |
| correctlyPartitioned = false; |
| } |
| } |
| else { |
| Integer[] lower = dist.boundaries[pIdx - 1]; |
| if ((s.f0.compareTo(lower[0]) <= 0)) { |
| correctlyPartitioned = false; |
| } |
| } |
| |
| if (!correctlyPartitioned) { |
| fail("Record was not correctly partitioned: " + s.toString()); |
| } |
| } |
| } |
| } |
| ); |
| |
| result.output(new DiscardingOutputFormat<Boolean>()); |
| env.execute(); |
| } |
| |
| /** |
| * Test the record partitioned rightly with two fields according to the customized data distribution. |
| */ |
| @Test |
| public void testRangeWithDistribution2() throws Exception { |
| final TestDataDist2 dist = new TestDataDist2(); |
| |
| final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); |
| env.setParallelism(dist.getParallelism()); |
| |
| DataSet<Tuple3<Integer, Integer, String>> input = env.fromElements( |
| new Tuple3<>(1, 5, "Hi"), |
| new Tuple3<>(1, 6, "Hi"), |
| new Tuple3<>(1, 7, "Hi"), |
| new Tuple3<>(1, 11, "Hello"), |
| new Tuple3<>(2, 3, "World"), |
| new Tuple3<>(2, 4, "World"), |
| new Tuple3<>(2, 5, "World"), |
| new Tuple3<>(2, 13, "Hello World"), |
| new Tuple3<>(3, 8, "Say"), |
| new Tuple3<>(4, 0, "Why"), |
| new Tuple3<>(4, 2, "Java"), |
| new Tuple3<>(4, 11, "Say Hello"), |
| new Tuple3<>(5, 1, "Hi Java!"), |
| new Tuple3<>(5, 2, "Hi Java?"), |
| new Tuple3<>(5, 3, "Hi Java again") |
| ); |
| |
| DataSet<Boolean> result = DataSetUtils |
| .partitionByRange(input, dist, 0, 1) |
| .mapPartition(new RichMapPartitionFunction<Tuple3<Integer, Integer, String>, Boolean>() { |
| |
| @Override |
| public void mapPartition(Iterable<Tuple3<Integer, Integer, String>> values, Collector<Boolean> out) throws Exception { |
| int pIdx = getRuntimeContext().getIndexOfThisSubtask(); |
| boolean correctlyPartitioned = true; |
| |
| for (Tuple3<Integer, Integer, String> s : values) { |
| |
| if (pIdx == 0) { |
| Integer[] upper = dist.boundaries[0]; |
| if (s.f0.compareTo(upper[0]) > 0 || |
| (s.f0.compareTo(upper[0]) == 0 && s.f1.compareTo(upper[1]) > 0)) { |
| correctlyPartitioned = false; |
| } |
| } |
| else if (pIdx > 0 && pIdx < dist.getParallelism() - 1) { |
| Integer[] lower = dist.boundaries[pIdx - 1]; |
| Integer[] upper = dist.boundaries[pIdx]; |
| |
| if (s.f0.compareTo(upper[0]) > 0 || |
| (s.f0.compareTo(upper[0]) == 0 && s.f1.compareTo(upper[1]) > 0) || |
| (s.f0.compareTo(lower[0]) < 0) || |
| (s.f0.compareTo(lower[0]) == 0 && s.f1.compareTo(lower[1]) <= 0)) { |
| correctlyPartitioned = false; |
| } |
| } |
| else { |
| Integer[] lower = dist.boundaries[pIdx - 1]; |
| if ((s.f0.compareTo(lower[0]) < 0) || |
| (s.f0.compareTo(lower[0]) == 0 && s.f1.compareTo(lower[1]) <= 0)) { |
| correctlyPartitioned = false; |
| } |
| } |
| |
| if (!correctlyPartitioned) { |
| fail("Record was not correctly partitioned: " + s.toString()); |
| } |
| } |
| } |
| } |
| ); |
| |
| result.output(new DiscardingOutputFormat<Boolean>()); |
| env.execute(); |
| } |
| |
| /* |
| * Test the number of partition keys less than the number of distribution fields |
| */ |
| @Test |
| public void testPartitionKeyLessDistribution() throws Exception { |
| final TestDataDist2 dist = new TestDataDist2(); |
| |
| final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); |
| env.setParallelism(dist.getParallelism()); |
| |
| DataSet<Tuple3<Integer, Long, String>> input = CollectionDataSets.get3TupleDataSet(env); |
| |
| DataSet<Boolean> result = DataSetUtils |
| .partitionByRange(input, dist, 0) |
| .mapPartition(new RichMapPartitionFunction<Tuple3<Integer, Long, String>, Boolean>() { |
| |
| @Override |
| public void mapPartition(Iterable<Tuple3<Integer, Long, String>> values, Collector<Boolean> out) throws Exception { |
| int pIdx = getRuntimeContext().getIndexOfThisSubtask(); |
| |
| for (Tuple3<Integer, Long, String> s : values) { |
| boolean correctlyPartitioned = true; |
| if (pIdx == 0) { |
| Integer[] upper = dist.boundaries[0]; |
| if (s.f0.compareTo(upper[0]) > 0) { |
| correctlyPartitioned = false; |
| } |
| } |
| else if (pIdx > 0 && pIdx < dist.getParallelism() - 1) { |
| Integer[] lower = dist.boundaries[pIdx - 1]; |
| Integer[] upper = dist.boundaries[pIdx]; |
| if (s.f0.compareTo(upper[0]) > 0 || (s.f0.compareTo(lower[0]) <= 0)) { |
| correctlyPartitioned = false; |
| } |
| } |
| else { |
| Integer[] lower = dist.boundaries[pIdx - 1]; |
| if ((s.f0.compareTo(lower[0]) <= 0)) { |
| correctlyPartitioned = false; |
| } |
| } |
| |
| if (!correctlyPartitioned) { |
| fail("Record was not correctly partitioned: " + s.toString()); |
| } |
| } |
| } |
| } |
| ); |
| |
| result.output(new DiscardingOutputFormat<Boolean>()); |
| env.execute(); |
| } |
| |
| /* |
| * Test the number of partition keys larger than the number of distribution fields |
| */ |
| @Test(expected = IllegalArgumentException.class) |
| public void testPartitionMoreThanDistribution() throws Exception { |
| final TestDataDist2 dist = new TestDataDist2(); |
| |
| ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); |
| |
| DataSet<Tuple3<Integer, Long, String>> input = CollectionDataSets.get3TupleDataSet(env); |
| DataSetUtils.partitionByRange(input, dist, 0, 1, 2); |
| } |
| |
| /** |
| * The class is used to do the tests of range partition with one key. |
| */ |
| public static class TestDataDist1 implements DataDistribution { |
| |
| public Integer[][] boundaries = new Integer[][]{ |
| new Integer[]{4}, |
| new Integer[]{9}, |
| new Integer[]{13}, |
| new Integer[]{18} |
| }; |
| |
| public TestDataDist1() {} |
| |
| public int getParallelism() { |
| return boundaries.length; |
| } |
| |
| @Override |
| public Object[] getBucketBoundary(int bucketNum, int totalNumBuckets) { |
| return boundaries[bucketNum]; |
| } |
| |
| @Override |
| public int getNumberOfFields() { |
| return 1; |
| } |
| |
| @Override |
| public TypeInformation[] getKeyTypes() { |
| return new TypeInformation[]{BasicTypeInfo.INT_TYPE_INFO}; |
| } |
| |
| @Override |
| public void write(DataOutputView out) throws IOException {} |
| |
| @Override |
| public void read(DataInputView in) throws IOException {} |
| } |
| |
| /** |
| * The class is used to do the tests of range partition with two keys. |
| */ |
| public static class TestDataDist2 implements DataDistribution { |
| |
| public Integer[][] boundaries = new Integer[][]{ |
| new Integer[]{1, 6}, |
| new Integer[]{2, 4}, |
| new Integer[]{3, 9}, |
| new Integer[]{4, 1}, |
| new Integer[]{5, 2} |
| }; |
| |
| public TestDataDist2() {} |
| |
| public int getParallelism() { |
| return boundaries.length; |
| } |
| |
| @Override |
| public Object[] getBucketBoundary(int bucketNum, int totalNumBuckets) { |
| return boundaries[bucketNum]; |
| } |
| |
| @Override |
| public int getNumberOfFields() { |
| return 2; |
| } |
| |
| @Override |
| public TypeInformation[] getKeyTypes() { |
| return new TypeInformation[]{BasicTypeInfo.INT_TYPE_INFO, BasicTypeInfo.INT_TYPE_INFO}; |
| } |
| |
| @Override |
| public void write(DataOutputView out) throws IOException {} |
| |
| @Override |
| public void read(DataInputView in) throws IOException {} |
| } |
| } |