/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.iceberg;

import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Locale;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.data.GenericAppenderFactory;
import org.apache.iceberg.data.IcebergGenerics;
import org.apache.iceberg.data.RandomGenericData;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.io.FileAppender;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.types.Types;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;

import static org.apache.iceberg.types.Types.NestedField.required;

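/**
 * Verifies that table scan planning splits a single large data file into
 * multiple tasks according to the configured split size.
 */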
@RunWith(Parameterized.class)
public class TestSplitScan {
  private static final Configuration CONF = new Configuration();
  private static final HadoopTables TABLES = new HadoopTables(CONF);

  private static final long SPLIT_SIZE = 16 * 1024 * 1024;

  private static final Schema SCHEMA = new Schema(
      required(1, "id", Types.IntegerType.get()),
      required(2, "data", Types.StringType.get())
  );

  private Table table;
  private File tableLocation;

  @Rule
  public TemporaryFolder temp = new TemporaryFolder();

  private List<Record> expectedRecords;

  @Parameterized.Parameters(name = "format = {0}")
  public static Object[] parameters() {
    return new Object[] { "parquet", "avro" };
  }

  private final FileFormat format;

  public TestSplitScan(String format) {
    this.format = FileFormat.valueOf(format.toUpperCase(Locale.ENGLISH));
  }

  @Before
  public void before() throws IOException {
    tableLocation = new File(temp.newFolder(), "table");
    setupTable();
  }

  @Test
  public void test() {
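    // planTasks() combines and splits file scan tasks to match the table's
    // target split size, so one ~64 MB file with a 16 MB target split size
    // should plan into four tasks.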
    Assert.assertEquals(
        "There should be 4 tasks created since the file size is approximately 64MB and the split size is 16MB",
        4, Lists.newArrayList(table.newScan().planTasks()).size());

    List<Record> records = Lists.newArrayList(IcebergGenerics.read(table).build());
    Assert.assertEquals(expectedRecords.size(), records.size());
    for (int i = 0; i < expectedRecords.size(); i++) {
      Assert.assertEquals(expectedRecords.get(i), records.get(i));
    }
  }

  private void setupTable() throws IOException {
    table = TABLES.create(SCHEMA, tableLocation.toString());

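    // Override read.split.target-size so scan planning targets 16 MB splits.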
    table.updateProperties()
        .set(TableProperties.SPLIT_SIZE, String.valueOf(SPLIT_SIZE))
        .commit();

    // With this number of records and the given SCHEMA, the writer produces
    // a data file of approximately 64 MB.
    int numRecords = 2500000;
    expectedRecords = RandomGenericData.generate(SCHEMA, numRecords, 0L);
    File file = writeToFile(expectedRecords, format);

    DataFile dataFile = DataFiles.builder(PartitionSpec.unpartitioned())
        .withRecordCount(expectedRecords.size())
        .withFileSizeInBytes(file.length())
        .withPath(file.toString())
        .withFormat(format)
        .build();

    table.newAppend().appendFile(dataFile).commit();
  }

  private File writeToFile(List<Record> records, FileFormat fileFormat) throws IOException {
    File file = temp.newFile();
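    // TemporaryFolder.newFile() creates an empty file; delete it so the
    // appender can create the output file itself.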
    Assert.assertTrue(file.delete());

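    // Keep Parquet row groups at the split size so row group boundaries align
    // with the planned 16 MB splits; Avro ignores this Parquet-specific property.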
    GenericAppenderFactory factory = new GenericAppenderFactory(SCHEMA).set(
        TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, String.valueOf(SPLIT_SIZE));

    try (FileAppender<Record> appender = factory.newAppender(Files.localOutput(file), fileFormat)) {
      appender.addAll(records);
    }
    return file;
  }
}