/**
* Copyright 2012 Twitter, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package parquet.hadoop;

import static org.junit.Assert.assertEquals;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;

import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.junit.Test;

import parquet.column.Encoding;
import parquet.hadoop.api.ReadSupport;
import parquet.hadoop.metadata.BlockMetaData;
import parquet.hadoop.metadata.ColumnChunkMetaData;
import parquet.hadoop.metadata.CompressionCodecName;
import parquet.hadoop.metadata.FileMetaData;
import parquet.schema.MessageType;
import parquet.schema.MessageTypeParser;
import parquet.schema.PrimitiveType.PrimitiveTypeName;

public class TestInputFormat {
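
  /**
   * Ten 2-byte row groups spread across two 50-byte HDFS blocks should be
   * grouped into one split per HDFS block, five row groups apiece, with each
   * split located on the datanodes hosting its HDFS block.
   */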
  @Test
  public void testBlocksToSplits() throws IOException, InterruptedException {
    // Row groups whose column chunks start at offsets 0, 10, ..., 90.
    List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
    for (int i = 0; i < 10; i++) {
      blocks.add(newBlock(i * 10));
    }
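    // Two 50-byte HDFS blocks cover the 100-byte file, each hosted on its own pair of datanodes.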
    BlockLocation[] hdfsBlocks = new BlockLocation[] {
        new BlockLocation(new String[0], new String[] { "foo0.datanode", "bar0.datanode" }, 0, 50),
        new BlockLocation(new String[0], new String[] { "foo1.datanode", "bar1.datanode" }, 50, 50)
    };
    FileStatus fileStatus = new FileStatus(100, false, 2, 50, 0, new Path("hdfs://foo.namenode:1234/bar"));
    MessageType schema = MessageTypeParser.parseMessageType("message doc { required binary foo; }");
    FileMetaData fileMetaData = new FileMetaData(schema, new HashMap<String, String>(), "parquet-mr");
    @SuppressWarnings("serial")
    List<ParquetInputSplit> splits = ParquetInputFormat.generateSplits(
        blocks, hdfsBlocks, fileStatus, fileMetaData, ReadSupport.class, schema.toString(),
        new HashMap<String, String>() {{ put("specific", "foo"); }});
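    // One split per HDFS block is expected; the pretty-printed splits double as the failure message.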
    assertEquals(splits.toString().replaceAll("([{])", "$0\n").replaceAll("([}])", "\n$0"), 2, splits.size());
    for (int i = 0; i < splits.size(); i++) {
      ParquetInputSplit parquetInputSplit = splits.get(i);
      assertEquals(5, parquetInputSplit.getBlocks().size());
      assertEquals(2, parquetInputSplit.getLocations().length);
      assertEquals("[foo" + i + ".datanode, bar" + i + ".datanode]", Arrays.toString(parquetInputSplit.getLocations()));
      assertEquals(10, parquetInputSplit.getLength()); // five row groups of 2 bytes each
      assertEquals("foo", parquetInputSplit.getReadSupportMetadata().get("specific"));
    }
  }

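  /** Builds a row group holding a single 2-byte BINARY column chunk whose first data page sits at {@code start}. */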
  private BlockMetaData newBlock(long start) {
    BlockMetaData blockMetaData = new BlockMetaData();
    ColumnChunkMetaData column = new ColumnChunkMetaData(
        new String[] { "foo" }, PrimitiveTypeName.BINARY, CompressionCodecName.GZIP, Arrays.asList(Encoding.PLAIN));
    column.setFirstDataPageOffset(start);
    column.setTotalSize(2);
    blockMetaData.addColumn(column);
    return blockMetaData;
  }
}