/**
* Copyright 2012 Twitter, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package parquet.hadoop;

import static org.junit.Assert.assertEquals;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;

import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.junit.Test;

import parquet.column.Encoding;
import parquet.hadoop.api.ReadSupport;
import parquet.hadoop.metadata.BlockMetaData;
import parquet.hadoop.metadata.ColumnChunkMetaData;
import parquet.hadoop.metadata.CompressionCodecName;
import parquet.hadoop.metadata.FileMetaData;
import parquet.schema.MessageType;
import parquet.schema.MessageTypeParser;
import parquet.schema.PrimitiveType.PrimitiveTypeName;

public class TestInputFormat {
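
  /**
   * Ten 2-byte row groups spread across two 50-byte HDFS blocks should be
   * grouped into one split per HDFS block, five row groups apiece, with each
   * split located on the datanodes hosting its HDFS block.
   */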
  @Test
  public void testBlocksToSplits() throws IOException, InterruptedException {
    // Row groups whose column chunks start at offsets 0, 10, ..., 90.
    List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
    for (int i = 0; i < 10; i++) {
      blocks.add(newBlock(i * 10));
    }
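    // Two 50-byte HDFS blocks cover the 100-byte file, each hosted on its own pair of datanodes.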
    BlockLocation[] hdfsBlocks = new BlockLocation[] {
        new BlockLocation(new String[0], new String[] { "foo0.datanode", "bar0.datanode" }, 0, 50),
        new BlockLocation(new String[0], new String[] { "foo1.datanode", "bar1.datanode" }, 50, 50)
    };
    FileStatus fileStatus = new FileStatus(100, false, 2, 50, 0, new Path("hdfs://foo.namenode:1234/bar"));
    MessageType schema = MessageTypeParser.parseMessageType("message doc { required binary foo; }");
    FileMetaData fileMetaData = new FileMetaData(schema, new HashMap<String, String>(), "parquet-mr");
    @SuppressWarnings("serial")
    List<ParquetInputSplit> splits = ParquetInputFormat.generateSplits(
        blocks, hdfsBlocks, fileStatus, fileMetaData, ReadSupport.class, schema.toString(),
        new HashMap<String, String>() {{ put("specific", "foo"); }});
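    // One split per HDFS block is expected; the pretty-printed splits double as the failure message.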
    assertEquals(splits.toString().replaceAll("([{])", "$0\n").replaceAll("([}])", "\n$0"), 2, splits.size());
    for (int i = 0; i < splits.size(); i++) {
      ParquetInputSplit parquetInputSplit = splits.get(i);
      assertEquals(5, parquetInputSplit.getBlocks().size());
      assertEquals(2, parquetInputSplit.getLocations().length);
      assertEquals("[foo" + i + ".datanode, bar" + i + ".datanode]", Arrays.toString(parquetInputSplit.getLocations()));
      assertEquals(10, parquetInputSplit.getLength()); // five row groups of 2 bytes each
      assertEquals("foo", parquetInputSplit.getReadSupportMetadata().get("specific"));
    }
  }

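  /** Builds a row group holding a single 2-byte BINARY column chunk whose first data page sits at {@code start}. */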
  private BlockMetaData newBlock(long start) {
    BlockMetaData blockMetaData = new BlockMetaData();
    ColumnChunkMetaData column = new ColumnChunkMetaData(
        new String[] { "foo" }, PrimitiveTypeName.BINARY, CompressionCodecName.GZIP, Arrays.asList(Encoding.PLAIN));
    column.setFirstDataPageOffset(start);
    column.setTotalSize(2);
    blockMetaData.addColumn(column);
    return blockMetaData;
  }
}