/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
 */

package org.apache.iceberg;

import com.esotericsoftware.kryo.Kryo;
import com.esotericsoftware.kryo.io.Input;
import com.esotericsoftware.kryo.io.Output;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.spark.SparkTestBase;
import org.apache.iceberg.spark.source.ThreeColumnRecord;
import org.apache.iceberg.types.Types;
import org.apache.spark.SparkConf;
import org.apache.spark.serializer.KryoSerializer;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;

import static org.apache.iceberg.types.Types.NestedField.optional;

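/**
 * Verifies that {@link BaseCombinedScanTask} survives round-trip serialization with both
 * Spark's Kryo serializer and plain Java serialization, since Spark serializes scan tasks
 * when it distributes work across executors.
 */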
public abstract class TestScanTaskSerialization extends SparkTestBase {
private static final HadoopTables TABLES = new HadoopTables(new Configuration());
private static final Schema SCHEMA = new Schema(
optional(1, "c1", Types.IntegerType.get()),
optional(2, "c2", Types.StringType.get()),
optional(3, "c3", Types.StringType.get())
  );

@Rule
  public TemporaryFolder temp = new TemporaryFolder();

  private String tableLocation = null;

@Before
public void setupTableLocation() throws Exception {
File tableDir = temp.newFolder();
this.tableLocation = tableDir.toURI().toString();
  }

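  // Round-trips a combined scan task through Spark's configured Kryo serializer and checks
  // that the deserialized task matches the original.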
@Test
public void testBaseCombinedScanTaskKryoSerialization() throws Exception {
BaseCombinedScanTask scanTask = prepareBaseCombinedScanTaskForSerDeTest();
File data = temp.newFile();
Assert.assertTrue(data.delete());
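    // use a Kryo instance configured exactly as Spark's KryoSerializer would configure it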
Kryo kryo = new KryoSerializer(new SparkConf()).newKryo();
try (Output out = new Output(new FileOutputStream(data))) {
kryo.writeClassAndObject(out, scanTask);
    }

try (Input in = new Input(new FileInputStream(data))) {
Object obj = kryo.readClassAndObject(in);
Assert.assertTrue("Should be a BaseCombinedScanTask", obj instanceof BaseCombinedScanTask);
TaskCheckHelper.assertEquals(scanTask, (BaseCombinedScanTask) obj);
}
  }

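  // Same round-trip as above, but through plain java.io serialization.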
@Test
public void testBaseCombinedScanTaskJavaSerialization() throws Exception {
BaseCombinedScanTask scanTask = prepareBaseCombinedScanTaskForSerDeTest();
ByteArrayOutputStream bytes = new ByteArrayOutputStream();
try (ObjectOutputStream out = new ObjectOutputStream(bytes)) {
out.writeObject(scanTask);
    }

try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) {
Object obj = in.readObject();
Assert.assertTrue("Should be a BaseCombinedScanTask", obj instanceof BaseCombinedScanTask);
TaskCheckHelper.assertEquals(scanTask, (BaseCombinedScanTask) obj);
}
  }

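  // Creates a table with two appended batches and bundles every planned file scan task
  // into a single BaseCombinedScanTask for the tests above.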
private BaseCombinedScanTask prepareBaseCombinedScanTaskForSerDeTest() {
PartitionSpec spec = PartitionSpec.unpartitioned();
Map<String, String> options = Maps.newHashMap();
Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
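
    // write two batches in separate appends so the scan plans more than one data file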
List<ThreeColumnRecord> records1 = Lists.newArrayList(
new ThreeColumnRecord(1, null, "AAAA"),
new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")
);
writeRecords(records1);
List<ThreeColumnRecord> records2 = Lists.newArrayList(
new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"),
new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")
);
writeRecords(records2);
table.refresh();
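
    // plan all file scan tasks for the table and combine them into a single task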
CloseableIterable<FileScanTask> tasks = table.newScan().planFiles();
return new BaseCombinedScanTask(Lists.newArrayList(tasks));
  }

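  // Helpers that append records to the test table through the Spark DataFrame writer.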
private void writeRecords(List<ThreeColumnRecord> records) {
Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class);
writeDF(df);
  }

private void writeDF(Dataset<Row> df) {
df.select("c1", "c2", "c3")
.write()
.format("iceberg")
.mode("append")
.save(tableLocation);
}
}