/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.iceberg;

import com.esotericsoftware.kryo.Kryo;
import com.esotericsoftware.kryo.io.Input;
import com.esotericsoftware.kryo.io.Output;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.Map;
import java.util.UUID;
import org.apache.iceberg.io.FileAppender;
import org.apache.iceberg.parquet.Parquet;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.spark.SparkSchemaUtil;
import org.apache.iceberg.spark.data.RandomData;
import org.apache.iceberg.spark.data.SparkParquetWriters;
import org.apache.iceberg.types.Types;
import org.apache.spark.SparkConf;
import org.apache.spark.serializer.KryoSerializer;
import org.apache.spark.sql.catalyst.InternalRow;
import org.junit.Assert;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;

import static org.apache.iceberg.TaskCheckHelper.assertEquals;
import static org.apache.iceberg.types.Types.NestedField.optional;
import static org.apache.iceberg.types.Types.NestedField.required;

public class TestDataFileSerialization {

  private static final Schema DATE_SCHEMA = new Schema(
      required(1, "id", Types.LongType.get()),
      optional(2, "data", Types.StringType.get()),
      required(3, "date", Types.StringType.get()),
      optional(4, "double", Types.DoubleType.get()));

  private static final PartitionSpec PARTITION_SPEC = PartitionSpec
      .builderFor(DATE_SCHEMA)
      .identity("date")
      .build();
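
  // Column-level metrics (value, null, and NaN counts plus lower/upper bounds) used to build the test DataFile below.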
  private static final Map<Integer, Long> VALUE_COUNTS = Maps.newHashMap();
  private static final Map<Integer, Long> NULL_VALUE_COUNTS = Maps.newHashMap();
  private static final Map<Integer, Long> NAN_VALUE_COUNTS = Maps.newHashMap();
  private static final Map<Integer, ByteBuffer> LOWER_BOUNDS = Maps.newHashMap();
  private static final Map<Integer, ByteBuffer> UPPER_BOUNDS = Maps.newHashMap();

  static {
    VALUE_COUNTS.put(1, 5L);
    VALUE_COUNTS.put(2, 3L);
    VALUE_COUNTS.put(4, 2L);
    NULL_VALUE_COUNTS.put(1, 0L);
    NULL_VALUE_COUNTS.put(2, 2L);
    NAN_VALUE_COUNTS.put(4, 1L);
    LOWER_BOUNDS.put(1, longToBuffer(0L));
    UPPER_BOUNDS.put(1, longToBuffer(4L));
  }
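
  // A fully populated DataFile: partition path, metrics, split offsets, and encryption key metadata.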
  private static final DataFile DATA_FILE = DataFiles
      .builder(PARTITION_SPEC)
      .withPath("/path/to/data-1.parquet")
      .withFileSizeInBytes(1234)
      .withPartitionPath("date=2018-06-08")
      .withMetrics(new Metrics(
          5L, null, VALUE_COUNTS, NULL_VALUE_COUNTS, NAN_VALUE_COUNTS, LOWER_BOUNDS, UPPER_BOUNDS))
      .withSplitOffsets(ImmutableList.of(4L))
      .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(34))
      .build();

  @Rule
  public TemporaryFolder temp = new TemporaryFolder();
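
  // Round-trips the DataFile (and a copy of it) through Spark's Kryo serializer and checks that the
  // deserialized instances match the original.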
  @Test
  public void testDataFileKryoSerialization() throws Exception {
    File data = temp.newFile();
    Assert.assertTrue(data.delete());
    Kryo kryo = new KryoSerializer(new SparkConf()).newKryo();

    try (Output out = new Output(new FileOutputStream(data))) {
      kryo.writeClassAndObject(out, DATA_FILE);
      kryo.writeClassAndObject(out, DATA_FILE.copy());
    }

    try (Input in = new Input(new FileInputStream(data))) {
      for (int i = 0; i < 2; i += 1) {
        Object obj = kryo.readClassAndObject(in);
        Assert.assertTrue("Should be a DataFile", obj instanceof DataFile);
        assertEquals(DATA_FILE, (DataFile) obj);
      }
    }
  }
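
  // Same round-trip as above, but through standard Java serialization.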
  @Test
  public void testDataFileJavaSerialization() throws Exception {
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    try (ObjectOutputStream out = new ObjectOutputStream(bytes)) {
      out.writeObject(DATA_FILE);
      out.writeObject(DATA_FILE.copy());
    }

    try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) {
      for (int i = 0; i < 2; i += 1) {
        Object obj = in.readObject();
        Assert.assertTrue("Should be a DataFile", obj instanceof DataFile);
        assertEquals(DATA_FILE, (DataFile) obj);
      }
    }
  }
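
  // Writes a small Parquet file and verifies that the writer's split offsets survive Kryo serialization.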
  @Test
  public void testParquetWriterSplitOffsets() throws IOException {
    Iterable<InternalRow> records = RandomData.generateSpark(DATE_SCHEMA, 1, 33L);
    File parquetFile = new File(
        temp.getRoot(),
        FileFormat.PARQUET.addExtension(UUID.randomUUID().toString()));
    FileAppender<InternalRow> writer =
        Parquet.write(Files.localOutput(parquetFile))
            .schema(DATE_SCHEMA)
            .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(DATE_SCHEMA), msgType))
            .build();
    try {
      writer.addAll(records);
    } finally {
      writer.close();
    }

    Kryo kryo = new KryoSerializer(new SparkConf()).newKryo();
    File dataFile = temp.newFile();
    try (Output out = new Output(new FileOutputStream(dataFile))) {
      kryo.writeClassAndObject(out, writer.splitOffsets());
    }
    try (Input in = new Input(new FileInputStream(dataFile))) {
      kryo.readClassAndObject(in);
    }
  }
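
  // Encodes a long as an 8-byte little-endian buffer, the layout used for the bounds maps above.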
  private static ByteBuffer longToBuffer(long value) {
    return ByteBuffer.allocate(8).order(ByteOrder.LITTLE_ENDIAN).putLong(0, value);
  }
}